Index: lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -16,6 +16,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 using namespace llvm;
 using namespace PatternMatch;
 
@@ -637,8 +638,9 @@
   return nullptr;
 }
 
-/// CanEvaluateZExtd - Determine if the specified value can be computed in the
-/// specified wider type and produce the same low bits.  If not, return false.
+/// ComputeEvaluateZExtdCost - Compute the extra cost if the specified value is
+/// computed in the specified wider type and produce the same low bits.
+/// If such evalutation is not possible, return INT_MAX.
 ///
 /// If this function returns true, it can also return a non-zero number of bits
 /// (in BitsToClear) which indicates that the value it computes is correct for
@@ -649,48 +651,55 @@
 ///   %C = lshr i32 %B, 8
 ///   %E = zext i32 %C to i64
 ///
-/// CanEvaluateZExtd for the 'lshr' will return true, and BitsToClear will be
-/// set to 8 to indicate that the promoted value needs to have bits 24-31
-/// cleared in addition to bits 32-63.  Since an 'and' will be generated to
-/// clear the top bits anyway, doing this has no extra cost.
+/// ComputeEvaluateZExtdCost for the 'lshr' will set BitsToClear 8 to indicate
+/// that the promoted value needs to have bits 24-31 cleared in addition to bits
+/// 32-63.  Since an 'and' will be generated to clear the top bits anyway, doing
+/// this has no extra cost.
 ///
 /// This function works on both vectors and scalars.
-static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
-                             InstCombiner &IC, Instruction *CxtI) {
+static int ComputeEvaluateZExtdCost(Value *V, Type *Ty, unsigned &BitsToClear,
+                                    InstCombiner &IC, Instruction *CxtI,
+                                    const TargetTransformInfo &TTI) {
   BitsToClear = 0;
   if (isa<Constant>(V))
-    return true;
+    return 0;
+
+  Type *OrigTy = V->getType();
 
   Instruction *I = dyn_cast<Instruction>(V);
-  if (!I) return false;
+  if (!I) return INT_MAX;
 
   // If the input is a truncate from the destination type, we can trivially
   // eliminate it.
   if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
-    return true;
+    return 0;
 
   // We can't extend or shrink something that has multiple uses: doing so would
   // require duplicating the instruction in general, which isn't profitable.
-  if (!I->hasOneUse()) return false;
+  if (!I->hasOneUse()) return INT_MAX;
 
   unsigned Opc = I->getOpcode(), Tmp;
   switch (Opc) {
   case Instruction::ZExt:  // zext(zext(x)) -> zext(x).
   case Instruction::SExt:  // zext(sext(x)) -> sext(x).
   case Instruction::Trunc: // zext(trunc(x)) -> trunc(x) or zext(x)
-    return true;
+    return 0;
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
   case Instruction::Add:
   case Instruction::Sub:
-  case Instruction::Mul:
-    if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) ||
-        !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI))
-      return false;
+  case Instruction::Mul: {
+    int LHSCost = ComputeEvaluateZExtdCost(I->getOperand(0), Ty, BitsToClear,
+                                           IC, CxtI, TTI);
+    if (LHSCost == INT_MAX) return INT_MAX;
+    int RHSCost =
+        ComputeEvaluateZExtdCost(I->getOperand(1), Ty, Tmp, IC, CxtI, TTI);
+    if (RHSCost == INT_MAX) return INT_MAX;
     // These can all be promoted if neither operand has 'bits to clear'.
     if (BitsToClear == 0 && Tmp == 0)
-      return true;
+      return LHSCost + RHSCost + TTI.getArithmeticInstrCost(Opc, Ty) -
+             TTI.getArithmeticInstrCost(Opc, OrigTy);
 
     // If the operation is an AND/OR/XOR and the bits to clear are zero in the
     // other side, BitsToClear is ok.
@@ -703,63 +712,84 @@
       if (IC.MaskedValueIsZero(I->getOperand(1),
                                APInt::getHighBitsSet(VSize, BitsToClear),
                                0, CxtI))
-        return true;
+      return LHSCost + RHSCost + TTI.getArithmeticInstrCost(Opc, Ty) -
+             TTI.getArithmeticInstrCost(Opc, OrigTy);
     }
 
     // Otherwise, we don't know how to analyze this BitsToClear case yet.
-    return false;
+    return INT_MAX;
+  }
 
   case Instruction::Shl:
     // We can promote shl(x, cst) if we can promote x.  Since shl overwrites the
     // upper bits we can reduce BitsToClear by the shift amount.
     if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) {
-      if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
-        return false;
+      int Cost = ComputeEvaluateZExtdCost(I->getOperand(0), Ty, BitsToClear, IC,
+                                          CxtI, TTI);
+      if (Cost == INT_MAX)
+        return INT_MAX;
       uint64_t ShiftAmt = Amt->getZExtValue();
       BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0;
-      return true;
+      return Cost + TTI.getArithmeticInstrCost(Opc, Ty) -
+             TTI.getArithmeticInstrCost(Opc, OrigTy);
     }
-    return false;
+    return INT_MAX;
   case Instruction::LShr:
     // We can promote lshr(x, cst) if we can promote x.  This requires the
     // ultimate 'and' to clear out the high zero bits we're clearing out though.
     if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) {
-      if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
-        return false;
+      int Cost = ComputeEvaluateZExtdCost(I->getOperand(0), Ty, BitsToClear, IC,
+                                          CxtI, TTI);
+      if (Cost == INT_MAX)
+        return INT_MAX;
       BitsToClear += Amt->getZExtValue();
       if (BitsToClear > V->getType()->getScalarSizeInBits())
         BitsToClear = V->getType()->getScalarSizeInBits();
-      return true;
+      return Cost + TTI.getArithmeticInstrCost(Opc, Ty) -
+             TTI.getArithmeticInstrCost(Opc, OrigTy);
     }
     // Cannot promote variable LSHR.
-    return false;
-  case Instruction::Select:
-    if (!CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) ||
-        !CanEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) ||
-        // TODO: If important, we could handle the case when the BitsToClear are
-        // known zero in the disagreeing side.
-        Tmp != BitsToClear)
-      return false;
-    return true;
+    return INT_MAX;
+  case Instruction::Select: {
+    int Cost1 =
+        ComputeEvaluateZExtdCost(I->getOperand(1), Ty, Tmp, IC, CxtI, TTI);
+    if (Cost1 == INT_MAX)
+      return INT_MAX;
+    int Cost2 = ComputeEvaluateZExtdCost(I->getOperand(2), Ty, BitsToClear, IC,
+                                         CxtI, TTI);
+    // TODO: If important, we could handle the case when the BitsToClear are
+    // known zero in the disagreeing side.
+    if (Cost2 == INT_MAX || Tmp != BitsToClear) {
+      return INT_MAX;
+    }
+    return Cost1 + Cost2;
+  }
 
   case Instruction::PHI: {
     // We can change a phi if we can change all operands.  Note that we never
     // get into trouble with cyclic PHIs here because we only consider
     // instructions with a single use.
     PHINode *PN = cast<PHINode>(I);
-    if (!CanEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI))
-      return false;
-    for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i)
-      if (!CanEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) ||
+    int Cost = ComputeEvaluateZExtdCost(PN->getIncomingValue(0), Ty,
+                                        BitsToClear, IC, CxtI, TTI);
+    if (Cost == INT_MAX)
+      return INT_MAX;
+    int AccumCost = Cost;
+    for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
+      Cost = ComputeEvaluateZExtdCost(PN->getIncomingValue(i), Ty, Tmp, IC,
+                                      CxtI, TTI);
+      if (Cost == INT_MAX ||
           // TODO: If important, we could handle the case when the BitsToClear
           // are known zero in the disagreeing input.
           Tmp != BitsToClear)
-        return false;
-    return true;
+        return INT_MAX;
+      AccumCost += Cost;
+    }
+    return AccumCost;
   }
   default:
     // TODO: Can handle more cases here.
-    return false;
+    return INT_MAX;
   }
 }
 
@@ -787,13 +817,13 @@
   // strange.
   unsigned BitsToClear;
   if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
-      CanEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) {
+      ComputeEvaluateZExtdCost(Src, DestTy, BitsToClear, *this, &CI, TTI) <= 0) {
     assert(BitsToClear < SrcTy->getScalarSizeInBits() &&
            "Unreasonable BitsToClear");
 
     // Okay, we can transform this!  Insert the new expression now.
     DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
-          " to avoid zero extend: " << CI);
+          " to avoid zero extend: " << CI << '\n');
     Value *Res = EvaluateInDifferentType(Src, DestTy, false);
     assert(Res->getType() == DestTy);
 
@@ -985,69 +1015,87 @@
   return nullptr;
 }
 
-/// CanEvaluateSExtd - Return true if we can take the specified value
-/// and return it as type Ty without inserting any new casts and without
-/// changing the value of the common low bits.  This is used by code that tries
-/// to promote integer operations to a wider types will allow us to eliminate
-/// the extension.
+/// ComputeEvaluateSExtdCost - Return the extra cost if we take the specified
+/// value and return it as type Ty without inserting any new casts and without
+/// changing the value of the common low bits. If such evalution is not
+/// possible, return INT_MAX.
+///
+/// This is used by code that tries to promote integer operations to a wider
+/// types will allow us to eliminate the extension.
 ///
 /// This function works on both vectors and scalars.
 ///
-static bool CanEvaluateSExtd(Value *V, Type *Ty) {
+static int ComputeEvaluateSExtdCost(Value *V, Type *Ty,
+                                    const TargetTransformInfo &TTI) {
   assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
          "Can't sign extend type to a smaller type");
   // If this is a constant, it can be trivially promoted.
   if (isa<Constant>(V))
-    return true;
+    return 0;
 
   Instruction *I = dyn_cast<Instruction>(V);
-  if (!I) return false;
+  if (!I) return INT_MAX;
 
   // If this is a truncate from the dest type, we can trivially eliminate it.
   if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
-    return true;
+    return 0;
 
   // We can't extend or shrink something that has multiple uses: doing so would
   // require duplicating the instruction in general, which isn't profitable.
-  if (!I->hasOneUse()) return false;
+  if (!I->hasOneUse()) return INT_MAX;
 
-  switch (I->getOpcode()) {
+  unsigned Opc = I->getOpcode();
+  switch (Opc) {
   case Instruction::SExt:  // sext(sext(x)) -> sext(x)
   case Instruction::ZExt:  // sext(zext(x)) -> zext(x)
   case Instruction::Trunc: // sext(trunc(x)) -> trunc(x) or sext(x)
-    return true;
+    return 0;
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
   case Instruction::Add:
   case Instruction::Sub:
-  case Instruction::Mul:
+  case Instruction::Mul: {
     // These operators can all arbitrarily be extended if their inputs can.
-    return CanEvaluateSExtd(I->getOperand(0), Ty) &&
-           CanEvaluateSExtd(I->getOperand(1), Ty);
+    int LHSCost = ComputeEvaluateSExtdCost(I->getOperand(0), Ty, TTI);
+    if (LHSCost == INT_MAX) return INT_MAX;
+    int RHSCost = ComputeEvaluateSExtdCost(I->getOperand(1), Ty, TTI);
+    if (RHSCost == INT_MAX) return INT_MAX;
+    return LHSCost + RHSCost + TTI.getArithmeticInstrCost(Opc, Ty) -
+           TTI.getArithmeticInstrCost(Opc, I->getType());
+  }
 
   //case Instruction::Shl:   TODO
   //case Instruction::LShr:  TODO
 
-  case Instruction::Select:
-    return CanEvaluateSExtd(I->getOperand(1), Ty) &&
-           CanEvaluateSExtd(I->getOperand(2), Ty);
+  case Instruction::Select: {
+    auto TrueCost = ComputeEvaluateSExtdCost(I->getOperand(1), Ty, TTI);
+    if (TrueCost == INT_MAX) return INT_MAX;
+    auto FalseCost = ComputeEvaluateSExtdCost(I->getOperand(2), Ty, TTI);
+    if (FalseCost == INT_MAX) return INT_MAX;
+    return TrueCost + FalseCost;
+  }
 
   case Instruction::PHI: {
     // We can change a phi if we can change all operands.  Note that we never
     // get into trouble with cyclic PHIs here because we only consider
     // instructions with a single use.
     PHINode *PN = cast<PHINode>(I);
-    for (Value *IncValue : PN->incoming_values())
-      if (!CanEvaluateSExtd(IncValue, Ty)) return false;
-    return true;
+    int AccumCost = 0;
+    for (Value *IncValue : PN->incoming_values()) {
+      int Cost = ComputeEvaluateSExtdCost(IncValue, Ty, TTI);
+      if (Cost == INT_MAX)
+        return INT_MAX;
+      AccumCost += Cost;
+    }
+    return AccumCost;
   }
   default:
     // TODO: Can handle more cases here.
     break;
   }
 
-  return false;
+  return INT_MAX;
 }
 
 Instruction *InstCombiner::visitSExt(SExtInst &CI) {
@@ -1081,10 +1129,10 @@
   // expression tree to something weird like i93 unless the source is also
   // strange.
   if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
-      CanEvaluateSExtd(Src, DestTy)) {
+      ComputeEvaluateSExtdCost(Src, DestTy, TTI) <= 0) {
     // Okay, we can transform this!  Insert the new expression now.
     DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
-          " to avoid sign extend: " << CI);
+          " to avoid sign extend: " << CI << '\n');
     Value *Res = EvaluateInDifferentType(Src, DestTy, true);
     assert(Res->getType() == DestTy);
 
Index: lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- lib/Transforms/InstCombine/InstCombineInternal.h
+++ lib/Transforms/InstCombine/InstCombineInternal.h
@@ -35,6 +35,7 @@
 class DataLayout;
 class DominatorTree;
 class TargetLibraryInfo;
+class TargetTransformInfo;
 class DbgDeclareInst;
 class MemIntrinsic;
 class MemSetInst;
@@ -183,6 +184,7 @@
   TargetLibraryInfo *TLI;
   DominatorTree *DT;
   const DataLayout &DL;
+  const TargetTransformInfo &TTI;
 
   // Optional analyses. When non-null, these can both be used to do better
   // combining and will be updated to reflect any changes.
@@ -193,9 +195,11 @@
 public:
   InstCombiner(InstCombineWorklist &Worklist, BuilderTy *Builder,
                bool MinimizeSize, AssumptionCache *AC, TargetLibraryInfo *TLI,
-               DominatorTree *DT, const DataLayout &DL, LoopInfo *LI)
+               DominatorTree *DT, const DataLayout &DL,
+               const TargetTransformInfo &TTI, LoopInfo *LI)
       : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
-        AC(AC), TLI(TLI), DT(DT), DL(DL), LI(LI), MadeIRChange(false) {}
+        AC(AC), TLI(TLI), DT(DT), DL(DL), TTI(TTI), LI(LI),
+        MadeIRChange(false) {}
 
   /// \brief Run the combiner over the entire worklist until it is empty.
   ///
Index: lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- lib/Transforms/InstCombine/InstructionCombining.cpp
+++ lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
@@ -2973,7 +2974,9 @@
 static bool
 combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
                                 AssumptionCache &AC, TargetLibraryInfo &TLI,
-                                DominatorTree &DT, LoopInfo *LI = nullptr) {
+                                DominatorTree &DT,
+                                const TargetTransformInfo &TTI,
+                                LoopInfo *LI = nullptr) {
   // Minimizing size?
   bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize);
   auto &DL = F.getParent()->getDataLayout();
@@ -2998,7 +3001,8 @@
     if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist))
       Changed = true;
 
-    InstCombiner IC(Worklist, &Builder, MinimizeSize, &AC, &TLI, &DT, DL, LI);
+    InstCombiner IC(Worklist, &Builder, MinimizeSize, &AC, &TLI, &DT, DL, TTI,
+                    LI);
     if (IC.run())
       Changed = true;
 
@@ -3014,10 +3018,10 @@
   auto &AC = AM->getResult<AssumptionAnalysis>(F);
   auto &DT = AM->getResult<DominatorTreeAnalysis>(F);
   auto &TLI = AM->getResult<TargetLibraryAnalysis>(F);
-
+  const auto &TTI = AM->getResult<TargetIRAnalysis>(F);
   auto *LI = AM->getCachedResult<LoopAnalysis>(F);
 
-  if (!combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI))
+  if (!combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, TTI, LI))
     // No changes, all analyses are preserved.
     return PreservedAnalyses::all();
 
@@ -3052,6 +3056,7 @@
   AU.setPreservesCFG();
   AU.addRequired<AssumptionCacheTracker>();
   AU.addRequired<TargetLibraryInfoWrapperPass>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addPreserved<DominatorTreeWrapperPass>();
 }
@@ -3063,13 +3068,14 @@
   // Required analyses.
   auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
   // Optional analyses.
   auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
   auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
 
-  return combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI);
+  return combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, TTI, LI);
 }
 
 char InstructionCombiningPass::ID = 0;
@@ -3077,6 +3083,7 @@
                       "Combine redundant instructions", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
                     "Combine redundant instructions", false, false)
Index: test/Transforms/InstCombine/NVPTX/lit.local.cfg
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'NVPTX' in config.root.targets:
+    config.unsupported = True
Index: test/Transforms/InstCombine/NVPTX/no-widen-expensive.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/NVPTX/no-widen-expensive.ll
@@ -0,0 +1,22 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+; For the nvptx64 architecture, the cost of an arithmetic instruction on a
+; 64-bit integer is twice as expensive as that on a 32-bit integer, because the
+; hardware needs to simulate a 64-bit integer using two 32-bit integers.
+; Therefore, in this particular architecture, we should not widen induction
+; variables to 64-bit integers even though i64 is a legal type in the 64-bit
+; PTX ISA.
+
+define i64 @test(i64 %A) {
+; CHECK-LABEL: @test
+  %trunc = trunc i64 %A to i32
+; CHECK-NOT: xor i64
+  %xor = xor i32 %trunc, 23
+; CHECK-NOT: and i64
+  %and = and i32 %xor, 112
+; CHECK: zext i32
+  %zext = zext i32 %and to i64
+  ret i64 %zext
+}