Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -277,6 +277,10 @@
     return 1;
   }
 
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
+    return 1;
+  }
+
   unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Ty, int Index,
                           Type *SubTp) {
     return 1;
@@ -499,6 +503,24 @@
         Operator::getOpcode(U), U->getType(),
         U->getNumOperands() == 1 ? U->getOperand(0)->getType() : nullptr);
   }
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
+    assert(Ty->isVectorTy() && "Can only scalarize vectors");
+    unsigned Cost = 0;
+
+    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+      if (Insert)
+        Cost += static_cast<T *>(this)
+                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+      if (Extract)
+        Cost += static_cast<T *>(this)
+                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+    }
+
+    return Cost;
+  }
 };
 }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -42,24 +42,6 @@
   typedef TargetTransformInfoImplCRTPBase<T> BaseT;
   typedef TargetTransformInfo TTI;
 
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
-    assert(Ty->isVectorTy() && "Can only scalarize vectors");
-    unsigned Cost = 0;
-
-    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
-      if (Insert)
-        Cost += static_cast<T *>(this)
-                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
-      if (Extract)
-        Cost += static_cast<T *>(this)
-                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
-    }
-
-    return Cost;
-  }
-
   /// Estimate the cost overhead of SK_Alternate shuffle.
   unsigned getAltShuffleOverhead(Type *Ty) {
     assert(Ty->isVectorTy() && "Can only shuffle vectors");
@@ -320,7 +302,9 @@
       // return the cost of multiple scalar invocation plus the cost of
       // inserting
      // and extracting the values.
-      return getScalarizationOverhead(Ty, true, true) + Num * Cost;
+      unsigned ScalarizeCost
+          = static_cast<T *>(this)->getScalarizationOverhead(Ty, true, true);
+      return ScalarizeCost + Num * Cost;
     }
 
     // We don't know anything about this scalar instruction.
@@ -411,19 +395,23 @@
 
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
-      return getScalarizationOverhead(Dst, true, true) + Num * Cost;
+      unsigned ScalarizeCost
+          = static_cast<T *>(this)->getScalarizationOverhead(Dst, true, true);
+      return ScalarizeCost + Num * Cost;
     }
 
     // We already handled vector-to-vector and scalar-to-scalar conversions.
     // This
     // is where we handle bitcast between vectors and scalars. We need to assume
     // that the conversion is scalarized in one way or another.
-    if (Opcode == Instruction::BitCast)
+    if (Opcode == Instruction::BitCast) {
+      T *This = static_cast<T *>(this);
       // Illegal bitcasts are done by storing and loading from a stack slot.
-      return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true)
-                                : 0) +
-             (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false)
-                                : 0);
+      return (Src->isVectorTy() ?
+                This->getScalarizationOverhead(Src, false, true) : 0) +
+             (Dst->isVectorTy() ?
+                This->getScalarizationOverhead(Dst, true, false) : 0);
+    }
 
     llvm_unreachable("Unhandled cast");
   }
@@ -464,7 +452,9 @@
       // Return the cost of multiple scalar invocation plus the cost of
      // inserting
      // and extracting the values.
-      return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
+      unsigned ScalarizeCost
+          = static_cast<T *>(this)->getScalarizationOverhead(ValTy, true, false);
+      return ScalarizeCost + Num * Cost;
     }
 
     // Unknown scalar opcode.
@@ -501,8 +491,11 @@
     if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
       // This is a vector load/store for some illegal type that is scalarized.
       // We must account for the cost of building or decomposing the vector.
-      Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
-                                       Opcode == Instruction::Store);
+
+      unsigned ScalarizeCost = static_cast<T *>(this)->
+          getScalarizationOverhead(Src, Opcode != Instruction::Store,
+                                   Opcode == Instruction::Store);
+      Cost += ScalarizeCost;
     }
   }
 
@@ -590,7 +583,8 @@
     unsigned ScalarCalls = 1;
     Type *ScalarRetTy = RetTy;
     if (RetTy->isVectorTy()) {
-      ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+      ScalarizationCost
+          = static_cast<T *>(this)->getScalarizationOverhead(RetTy, true, false);
       ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
       ScalarRetTy = RetTy->getScalarType();
     }
@@ -598,7 +592,8 @@
     for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
       Type *Ty = Tys[i];
       if (Ty->isVectorTy()) {
-        ScalarizationCost += getScalarizationOverhead(Ty, false, true);
+        ScalarizationCost +=
+            static_cast<T *>(this)->getScalarizationOverhead(Ty, false, true);
         ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
         Ty = Ty->getScalarType();
       }
@@ -725,7 +720,8 @@
     // this will emit a costly libcall, adding call overhead and spills. Make it
     // very expensive.
     if (RetTy->isVectorTy()) {
-      unsigned ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+      unsigned ScalarizationCost
+          = static_cast<T *>(this)->getScalarizationOverhead(RetTy, true, false);
       unsigned ScalarCalls = RetTy->getVectorNumElements();
       SmallVector<Type *, 4> ScalarTys;
       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
@@ -738,7 +734,9 @@
         IID, RetTy->getScalarType(), ScalarTys);
     for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
       if (Tys[i]->isVectorTy()) {
-        ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
+        ScalarizationCost +=
+            static_cast<T *>(this)->getScalarizationOverhead(Tys[i], false,
+                                                             true);
         ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
       }
     }
@@ -784,7 +782,9 @@
         NumReduxLevels * (IsPairwise + 1) *
         static_cast<T *>(this)
             ->getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
-    return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
+    unsigned ScalarizeCost
+        = static_cast<T *>(this)->getScalarizationOverhead(Ty, false, true);
+    return ShuffleCost + ArithCost + ScalarizeCost;
   }
 
   /// @}
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -34,10 +34,6 @@
   const AArch64Subtarget *ST;
   const AArch64TargetLowering *TLI;
 
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
-
   const AArch64Subtarget *getST() const { return ST; }
   const AArch64TargetLowering *getTLI() const { return TLI; }
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -61,6 +61,10 @@
   unsigned getRegisterBitWidth(bool Vector);
   unsigned getMaxInterleaveFactor(unsigned VF);
 
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
+    return 0;
+  }
+
   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
   bool isSourceOfDivergence(const Value *V) const;
 };
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -33,10 +33,6 @@
   const ARMSubtarget *ST;
   const ARMTargetLowering *TLI;
 
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
-
   const ARMSubtarget *getST() const { return ST; }
   const ARMTargetLowering *getTLI() const { return TLI; }
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -33,8 +33,6 @@
   const X86Subtarget *ST;
   const X86TargetLowering *TLI;
 
-  int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
-
   const X86Subtarget *getST() const { return ST; }
   const X86TargetLowering *getTLI() const { return TLI; }
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -934,20 +934,6 @@
   return BaseT::getVectorInstrCost(Opcode, Val, Index);
 }
 
-int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
-  assert (Ty->isVectorTy() && "Can only scalarize vectors");
-  int Cost = 0;
-
-  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
-    if (Insert)
-      Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
-    if (Extract)
-      Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
-  }
-
-  return Cost;
-}
-
 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace) {
   // Handle non-power-of-two vectors such as <3 x float>
Index: test/Analysis/CostModel/AMDGPU/add.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/add.ll
@@ -0,0 +1,56 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+
+; CHECK: 'add_i32'
+; CHECK: estimated cost of 1 for {{.*}} add i32
+define void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+  %vec = load i32, i32 addrspace(1)* %vaddr
+  %add = add i32 %vec, %b
+  store i32 %add, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'add_v2i32'
+; CHECK: estimated cost of 2 for {{.*}} add <2 x i32>
+define void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
+  %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
+  %add = add <2 x i32> %vec, %b
+  store <2 x i32> %add, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'add_v3i32'
+; CHECK: estimated cost of 3 for {{.*}} add <3 x i32>
+define void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
+  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
+  %add = add <3 x i32> %vec, %b
+  store <3 x i32> %add, <3 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'add_i64'
+; CHECK: estimated cost of 1 for {{.*}} add i64
+define void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+  %vec = load i64, i64 addrspace(1)* %vaddr
+  %add = add i64 %vec, %b
+  store i64 %add, i64 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'add_v2i64'
+; CHECK: estimated cost of 2 for {{.*}} add <2 x i64>
+define void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
+  %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
+  %add = add <2 x i64> %vec, %b
+  store <2 x i64> %add, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'add_v3i64'
+; CHECK: estimated cost of 3 for {{.*}} add <3 x i64>
+define void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
+  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
+  %add = add <3 x i64> %vec, %b
+  store <3 x i64> %add, <3 x i64> addrspace(1)* %out
+  ret void
+}
+
Index: test/Analysis/CostModel/AMDGPU/fabs.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/fabs.ll
+++ test/Analysis/CostModel/AMDGPU/fabs.ll
@@ -10,7 +10,7 @@
 }
 
 ; CHECK: 'fabs_v2f32'
-; CHECK: estimated cost of 2 for {{.*}} call <2 x float> @llvm.fabs.v2f32
+; CHECK: estimated cost of 0 for {{.*}} call <2 x float> @llvm.fabs.v2f32
 define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %vec) #1
@@ -19,7 +19,7 @@
 }
 
 ; CHECK: 'fabs_v3f32'
-; CHECK: estimated cost of 3 for {{.*}} call <3 x float> @llvm.fabs.v3f32
+; CHECK: estimated cost of 0 for {{.*}} call <3 x float> @llvm.fabs.v3f32
 define void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %vec) #1
@@ -37,7 +37,7 @@
 }
 
 ; CHECK: 'fabs_v2f64'
-; CHECK: estimated cost of 2 for {{.*}} call <2 x double> @llvm.fabs.v2f64
+; CHECK: estimated cost of 0 for {{.*}} call <2 x double> @llvm.fabs.v2f64
 define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %vec) #1
@@ -46,7 +46,7 @@
 }
 
 ; CHECK: 'fabs_v3f64'
-; CHECK: estimated cost of 3 for {{.*}} call <3 x double> @llvm.fabs.v3f64
+; CHECK: estimated cost of 0 for {{.*}} call <3 x double> @llvm.fabs.v3f64
 define void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %fabs = call <3 x double> @llvm.fabs.v3f64(<3 x double> %vec) #1