diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -115,21 +115,26 @@ return TargetTransformInfo::TCC_Basic; } - static inline int getHalfRateInstrCost() { - return 2 * TargetTransformInfo::TCC_Basic; + static inline int getHalfRateInstrCost( + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) { + return CostKind == TTI::TCK_CodeSize ? 2 + : 2 * TargetTransformInfo::TCC_Basic; } // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe // should be 2 or 4. - static inline int getQuarterRateInstrCost() { - return 3 * TargetTransformInfo::TCC_Basic; + static inline int getQuarterRateInstrCost( + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) { + return CostKind == TTI::TCK_CodeSize ? 2 + : 4 * TargetTransformInfo::TCC_Basic; } - // On some parts, normal fp64 operations are half rate, and others - // quarter. This also applies to some integer operations. - inline int get64BitInstrCost() const { - return ST->hasHalfRate64Ops() ? - getHalfRateInstrCost() : getQuarterRateInstrCost(); + // On some parts, normal fp64 operations are half rate, and others + // quarter. This also applies to some integer operations. + inline int get64BitInstrCost( + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const { + return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind) + : getQuarterRateInstrCost(CostKind); } public: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -472,9 +472,50 @@ // FIXME: We're having to query the throughput cost so that the basic // implementation tries to generate legalize and scalarization costs. Maybe // we could hoist the scalarization code here? - return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput, - Opd1Info, Opd2Info, Opd1PropInfo, - Opd2PropInfo, Args, CxtI); + if (CostKind != TTI::TCK_CodeSize) + return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput, - Opd1Info, Opd2Info, Opd1PropInfo, + Opd1Info, Opd2Info, Opd1PropInfo, + Opd2PropInfo, Args, CxtI); + // Scalarization + + // Check if any of the operands are vector operands. + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + + bool IsFloat = Ty->isFPOrFPVectorTy(); + // Assume that floating point arithmetic operations cost twice as much as + // integer operations. + unsigned OpCost = (IsFloat ? 2 : 1); + + if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { + // The operation is legal. Assume it costs 1. + // TODO: Once we have extract/insert subvector cost we need to use them. + return LT.first * OpCost; + } + + if (!TLI->isOperationExpand(ISD, LT.second)) { + // If the operation is custom lowered, then assume that the code is twice + // as expensive. + return LT.first * 2 * OpCost; + } + + // Else, assume that we need to scalarize this op. + // TODO: If one of the types get legalized by splitting, handle this + // similarly to what getCastInstrCost() does.
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) { + unsigned Num = cast<FixedVectorType>(VTy)->getNumElements(); + unsigned Cost = getArithmeticInstrCost( + Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo, Args, CxtI); + // Return the cost of multiple scalar invocation plus the cost of + // inserting and extracting the values. + return getScalarizationOverhead(VTy, Args) + Num * Cost; + } + + // We don't know anything about this scalar instruction. + return OpCost; } // Legalize the type. @@ -493,7 +534,7 @@ case ISD::SRL: case ISD::SRA: if (SLT == MVT::i64) - return get64BitInstrCost() * LT.first * NElts; + return get64BitInstrCost(CostKind) * LT.first * NElts; if (ST->has16BitInsts() && SLT == MVT::i16) NElts = (NElts + 1) / 2; @@ -515,7 +556,7 @@ return LT.first * NElts * getFullRateInstrCost(); case ISD::MUL: { - const int QuarterRateCost = getQuarterRateInstrCost(); + const int QuarterRateCost = getQuarterRateInstrCost(CostKind); if (SLT == MVT::i64) { const int FullRateCost = getFullRateInstrCost(); return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts; } @@ -552,7 +593,7 @@ case ISD::FADD: case ISD::FSUB: if (SLT == MVT::f64) - return LT.first * NElts * get64BitInstrCost(); + return LT.first * NElts * get64BitInstrCost(CostKind); if (ST->has16BitInsts() && SLT == MVT::f16) NElts = (NElts + 1) / 2; @@ -565,7 +606,9 @@ // FIXME: frem should be handled separately. The fdiv in it is most of it, // but the current lowering is also not entirely correct. if (SLT == MVT::f64) { - int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost(); + int Cost = 7 * get64BitInstrCost(CostKind) + + getQuarterRateInstrCost(CostKind) + + 3 * getHalfRateInstrCost(CostKind); // Add cost of workaround. if (!ST->hasUsableDivScaleConditionOutput()) Cost += 3 * getFullRateInstrCost(); @@ -577,7 +620,7 @@ // TODO: This is more complicated, unsafe flags etc. if ((SLT == MVT::f32 && !HasFP32Denormals) || (SLT == MVT::f16 && ST->has16BitInsts())) { - return LT.first * getQuarterRateInstrCost() * NElts; + return LT.first * getQuarterRateInstrCost(CostKind) * NElts; } } @@ -587,12 +630,15 @@ // f32 fmul // v_cvt_f16_f32 // f16 div_fixup - int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(); + int Cost = + 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind); return LT.first * Cost * NElts; } if (SLT == MVT::f32 || SLT == MVT::f16) { - int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost(); + // 4 more v_cvt_* insts without f16 insts support + int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() + + 1 * getQuarterRateInstrCost(CostKind); if (!HasFP32Denormals) { // FP mode switches. @@ -642,7 +688,48 @@ Type *RetTy = ICA.getReturnType(); EVT OrigTy = TLI->getValueType(DL, RetTy); if (!OrigTy.isSimple()) { - return BaseT::getIntrinsicInstrCost(ICA, CostKind); + if (CostKind != TTI::TCK_CodeSize) + return BaseT::getIntrinsicInstrCost(ICA, CostKind); + + // TODO: Combine these two logic paths. + if (ICA.isTypeBasedOnly()) + return getTypeBasedIntrinsicInstrCost(ICA, CostKind); + + Type *RetTy = ICA.getReturnType(); + unsigned VF = ICA.getVectorFactor(); + unsigned RetVF = + (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements() + : 1); + assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type"); + const IntrinsicInst *I = ICA.getInst(); + const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); + FastMathFlags FMF = ICA.getFlags(); + // Assume that we need to scalarize this intrinsic.
+ SmallVector<Type *, 4> Types; + for (const Value *Op : Args) { + Type *OpTy = Op->getType(); + assert(VF == 1 || !OpTy->isVectorTy()); + Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF)); + } + + if (VF > 1 && !RetTy->isVoidTy()) + RetTy = FixedVectorType::get(RetTy, VF); + + // Compute the scalarization overhead based on Args for a vector + // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while + // CostModel will pass a vector RetTy and VF is 1. + unsigned ScalarizationCost = std::numeric_limits<unsigned>::max(); + if (RetVF > 1 || VF > 1) { + ScalarizationCost = 0; + if (!RetTy->isVoidTy()) + ScalarizationCost += + getScalarizationOverhead(cast<VectorType>(RetTy), true, false); + ScalarizationCost += getOperandsScalarizationOverhead(Args, VF); + } + + IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF, + ScalarizationCost, I); + return getIntrinsicInstrCost(Attrs, CostKind); } // Legalize the type. @@ -654,16 +741,16 @@ MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; if (SLT == MVT::f64) - return LT.first * NElts * get64BitInstrCost(); + return LT.first * NElts * get64BitInstrCost(CostKind); if (ST->has16BitInsts() && SLT == MVT::f16) NElts = (NElts + 1) / 2; // TODO: Get more refined intrinsic costs? - unsigned InstRate = getQuarterRateInstrCost(); + unsigned InstRate = getQuarterRateInstrCost(CostKind); if (ICA.getID() == Intrinsic::fma) { - InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost() - : getQuarterRateInstrCost(); + InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind) + : getQuarterRateInstrCost(CostKind); } return LT.first * NElts * InstRate; @@ -714,7 +801,7 @@ CostKind); std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); - return LT.first * getHalfRateInstrCost(); + return LT.first * getHalfRateInstrCost(CostKind); } int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -1,9 +1,9 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,SIZEALL,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF16,SIZEALL,ALL %s -; ALL: 'fadd_f32' +; ALL-LABEL: 'fadd_f32' ; ALL: estimated cost of 1 for {{.*}} fadd float define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { %vec = load float, float addrspace(1)* %vaddr @@ -12,7 +12,7 @@ ret void } -; ALL: 'fadd_v2f32' +; ALL-LABEL: 'fadd_v2f32' ; ALL: estimated cost of 2 for {{.*}} fadd <2 x float> define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)*
%vaddr, <2 x float> %b) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr @@ -21,10 +21,8 @@ ret void } -; ALL: 'fadd_v3f32' -; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 3 when it is legal. -; ALL: estimated cost of {{[34]}} for {{.*}} fadd <3 x float> +; ALL-LABEL: 'fadd_v3f32' +; ALL: estimated cost of 3 for {{.*}} fadd <3 x float> define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fadd <3 x float> %vec, %b @@ -32,10 +30,8 @@ ret void } -; ALL: 'fadd_v5f32' -; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 5 when it is legal. -; ALL: estimated cost of {{[58]}} for {{.*}} fadd <5 x float> +; ALL-LABEL: 'fadd_v5f32' +; ALL: estimated cost of 5 for {{.*}} fadd <5 x float> define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fadd <5 x float> %vec, %b @@ -43,9 +39,10 @@ ret void } -; ALL: 'fadd_f64' +; ALL-LABEL: 'fadd_f64' ; FASTF64: estimated cost of 2 for {{.*}} fadd double -; SLOWF64: estimated cost of 3 for {{.*}} fadd double +; SLOWF64: estimated cost of 4 for {{.*}} fadd double +; SIZEALL: estimated cost of 2 for {{.*}} fadd double define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fadd double %vec, %b @@ -53,9 +50,10 @@ ret void } -; ALL: 'fadd_v2f64' +; ALL-LABEL: 'fadd_v2f64' ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double> -; SLOWF64: estimated cost of 6 for {{.*}} fadd <2 x double> +; SLOWF64: estimated cost of 8 for {{.*}} fadd <2 x double> +; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double> define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %add = fadd <2 x double> %vec, %b @@ -63,9 +61,10 @@ ret void } -; ALL: 'fadd_v3f64' +; ALL-LABEL: 'fadd_v3f64' ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double> -; SLOWF64: estimated cost of 9 for {{.*}} fadd <3 x double> +; SLOWF64: estimated cost of 12 for {{.*}} fadd <3 x double> +; SIZEALL: estimated cost of 6 for {{.*}} fadd <3 x double> define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %add = fadd <3 x double> %vec, %b @@ -73,7 +72,7 @@ ret void } -; ALL: 'fadd_f16' +; ALL-LABEL: 'fadd_f16' ; ALL: estimated cost of 1 for {{.*}} fadd half define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { %vec = load half, half addrspace(1)* %vaddr @@ -82,7 +81,7 @@ ret void } -; ALL: 'fadd_v2f16' +; ALL-LABEL: 'fadd_v2f16' ; SLOWF16: estimated cost of 2 for {{.*}} fadd <2 x half> ; FASTF16: estimated cost of 1 for {{.*}} fadd <2 x half> define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { @@ -92,7 +91,7 @@ ret void } -; ALL: 'fadd_v3f16' +; ALL-LABEL: 'fadd_v3f16' ; SLOWF16: estimated cost of 4 for {{.*}} fadd <3 x half> ; FASTF16: estimated cost of 2 for {{.*}} fadd <3 x half> define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half> 
addrspace(1)* %vaddr, <3 x half> %b) #0 { @@ -102,7 +101,7 @@ ret void } -; ALL: 'fadd_v4f16' +; ALL-LABEL: 'fadd_v4f16' ; SLOWF16: estimated cost of 4 for {{.*}} fadd <4 x half> ; FASTF16: estimated cost of 2 for {{.*}} fadd <4 x half> define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll @@ -1,19 +1,18 @@ -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,NOFP16,NOFP16-FP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FASTFP32DENORMS,FP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,THRPTALL,CIFASTF64,NOFP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,THRPTALL,CISLOWF64,NOFP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,THRPTALL,SIFASTF64,NOFP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,THRPTALL,SISLOWF64,NOFP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,THRPTALL,FP16,CISLOWF64 %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP16,NOFP16-NOFP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP16,NOFP16-NOFP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP16,NOFP16-NOFP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP16,NOFP16-NOFP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,SLOWFP32DENORMS,NOFP16,NOFP16-FP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FASTFP32DENORMS,FP16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZENOF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze 
-mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZENOF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZEF16 %s -; ALL: 'fdiv_f32_ieee' -; ALL: estimated cost of 10 for {{.*}} fdiv float +; ALL-LABEL: 'fdiv_f32_ieee' +; THRPTALL: estimated cost of 14 for {{.*}} fdiv float +; SIZEALL: estimated cost of 12 for {{.*}} fdiv float define amdgpu_kernel void @fdiv_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { %vec = load float, float addrspace(1)* %vaddr %add = fdiv float %vec, %b @@ -21,8 +20,9 @@ ret void } -; ALL: 'fdiv_f32_ftzdaz' -; ALL: estimated cost of 12 for {{.*}} fdiv float +; ALL-LABEL: 'fdiv_f32_ftzdaz' +; THRPTALL: estimated cost of 16 for {{.*}} fdiv float +; SIZEALL: estimated cost of 14 for {{.*}} fdiv float define amdgpu_kernel void @fdiv_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #1 { %vec = load float, float addrspace(1)* %vaddr %add = fdiv float %vec, %b @@ -30,8 +30,9 @@ ret void } -; ALL: 'fdiv_v2f32_ieee' -; ALL: estimated cost of 20 for {{.*}} fdiv <2 x float> +; ALL-LABEL: 'fdiv_v2f32_ieee' +; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float> +; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float> define amdgpu_kernel void @fdiv_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %add = fdiv <2 x float> %vec, %b @@ -39,8 +40,9 @@ ret void } -; ALL: 'fdiv_v2f32_ftzdaz' -; ALL: estimated cost of 24 for {{.*}} fdiv <2 x float> +; ALL-LABEL: 'fdiv_v2f32_ftzdaz' +; THRPTALL: estimated cost of 32 for {{.*}} fdiv <2 x float> +; SIZEALL: estimated cost of 28 for {{.*}} fdiv <2 x float> define amdgpu_kernel void @fdiv_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #1 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %add = fdiv <2 x float> %vec, %b @@ -48,10 +50,9 @@ ret void } -; ALL: 'fdiv_v3f32_ieee' -; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 36/30 when it is legal. -; ALL: estimated cost of {{30|40}} for {{.*}} fdiv <3 x float> +; ALL-LABEL: 'fdiv_v3f32_ieee' +; THRPTALL: estimated cost of 42 for {{.*}} fdiv <3 x float> +; SIZEALL: estimated cost of 36 for {{.*}} fdiv <3 x float> define amdgpu_kernel void @fdiv_v3f32_ieee(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fdiv <3 x float> %vec, %b @@ -59,10 +60,9 @@ ret void } -; ALL: 'fdiv_v3f32_ftzdaz' -; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 36/30 when it is legal. 
-; ALL: estimated cost of {{36|48}} for {{.*}} fdiv <3 x float> +; ALL-LABEL: 'fdiv_v3f32_ftzdaz' +; THRPTALL: estimated cost of 48 for {{.*}} fdiv <3 x float> +; SIZEALL: estimated cost of 42 for {{.*}} fdiv <3 x float> define amdgpu_kernel void @fdiv_v3f32_ftzdaz(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #1 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fdiv <3 x float> %vec, %b @@ -70,10 +70,9 @@ ret void } -; ALL: 'fdiv_v5f32_ieee' -; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 60/50 when it is legal. -; ALL: estimated cost of {{80|50}} for {{.*}} fdiv <5 x float> +; ALL-LABEL: 'fdiv_v5f32_ieee' +; THRPTALL: estimated cost of 70 for {{.*}} fdiv <5 x float> +; SIZEALL: estimated cost of 60 for {{.*}} fdiv <5 x float> define amdgpu_kernel void @fdiv_v5f32_ieee(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fdiv <5 x float> %vec, %b @@ -81,10 +80,9 @@ ret void } -; ALL: 'fdiv_v5f32_ftzdaz' -; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 60/50 when it is legal. -; ALL: estimated cost of {{96|60}} for {{.*}} fdiv <5 x float> +; ALL-LABEL: 'fdiv_v5f32_ftzdaz' +; THRPTALL: estimated cost of 80 for {{.*}} fdiv <5 x float> +; SIZEALL: estimated cost of 70 for {{.*}} fdiv <5 x float> define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #1 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fdiv <5 x float> %vec, %b @@ -92,11 +90,13 @@ ret void } -; ALL: 'fdiv_f64' -; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double -; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double -; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double -; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double +; ALL-LABEL: 'fdiv_f64' +; CIFASTF64: estimated cost of 24 for {{.*}} fdiv double +; CISLOWF64: estimated cost of 38 for {{.*}} fdiv double +; SIFASTF64: estimated cost of 27 for {{.*}} fdiv double +; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double +; SIZECI: estimated cost of 22 for {{.*}} fdiv double +; SIZESI: estimated cost of 25 for {{.*}} fdiv double define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fdiv double %vec, %b @@ -104,11 +104,13 @@ ret void } -; ALL: 'fdiv_v2f64' -; CIFASTF64: estimated cost of 58 for {{.*}} fdiv <2 x double> -; CISLOWF64: estimated cost of 66 for {{.*}} fdiv <2 x double> -; SIFASTF64: estimated cost of 64 for {{.*}} fdiv <2 x double> -; SISLOWF64: estimated cost of 72 for {{.*}} fdiv <2 x double> +; ALL-LABEL: 'fdiv_v2f64' +; CIFASTF64: estimated cost of 48 for {{.*}} fdiv <2 x double> +; CISLOWF64: estimated cost of 76 for {{.*}} fdiv <2 x double> +; SIFASTF64: estimated cost of 54 for {{.*}} fdiv <2 x double> +; SISLOWF64: estimated cost of 82 for {{.*}} fdiv <2 x double> +; SIZECI: estimated cost of 44 for {{.*}} fdiv <2 x double> +; SIZESI: estimated cost of 50 for {{.*}} fdiv <2 x double> define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %add = fdiv <2 x double> %vec, %b @@ -116,11 +118,13 @@ ret void } -; ALL: 'fdiv_v3f64' -; CIFASTF64: estimated cost of 87 for {{.*}} fdiv <3 
x double> -; CISLOWF64: estimated cost of 99 for {{.*}} fdiv <3 x double> -; SIFASTF64: estimated cost of 96 for {{.*}} fdiv <3 x double> -; SISLOWF64: estimated cost of 108 for {{.*}} fdiv <3 x double> +; ALL-LABEL: 'fdiv_v3f64' +; CIFASTF64: estimated cost of 72 for {{.*}} fdiv <3 x double> +; CISLOWF64: estimated cost of 114 for {{.*}} fdiv <3 x double> +; SIFASTF64: estimated cost of 81 for {{.*}} fdiv <3 x double> +; SISLOWF64: estimated cost of 123 for {{.*}} fdiv <3 x double> +; SIZECI: estimated cost of 66 for {{.*}} fdiv <3 x double> +; SIZESI: estimated cost of 75 for {{.*}} fdiv <3 x double> define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %add = fdiv <3 x double> %vec, %b @@ -128,9 +132,11 @@ ret void } -; ALL: 'fdiv_f16_f32_ieee' -; NOFP16: estimated cost of 10 for {{.*}} fdiv half -; FP16: estimated cost of 10 for {{.*}} fdiv half +; ALL-LABEL: 'fdiv_f16_f32_ieee' +; NOFP16: estimated cost of 14 for {{.*}} fdiv half +; FP16: estimated cost of 12 for {{.*}} fdiv half +; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half +; SIZEF16: estimated cost of 8 for {{.*}} fdiv half define amdgpu_kernel void @fdiv_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { %vec = load half, half addrspace(1)* %vaddr %add = fdiv half %vec, %b @@ -138,9 +144,11 @@ ret void } -; ALL: 'fdiv_f16_f32_ftzdaz' -; NOFP16: estimated cost of 12 for {{.*}} fdiv half -; FP16: estimated cost of 10 for {{.*}} fdiv half +; ALL-LABEL: 'fdiv_f16_f32_ftzdaz' +; NOFP16: estimated cost of 16 for {{.*}} fdiv half +; FP16: estimated cost of 12 for {{.*}} fdiv half +; SIZENOF16: estimated cost of 14 for {{.*}} fdiv half +; SIZEF16: estimated cost of 8 for {{.*}} fdiv half define amdgpu_kernel void @fdiv_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #1 { %vec = load half, half addrspace(1)* %vaddr %add = fdiv half %vec, %b @@ -148,9 +156,11 @@ ret void } -; ALL: 'fdiv_v2f16_f32_ieee' -; NOFP16: estimated cost of 20 for {{.*}} fdiv <2 x half> -; FP16: estimated cost of 20 for {{.*}} fdiv <2 x half> +; ALL-LABEL: 'fdiv_v2f16_f32_ieee' +; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half> +; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half> +; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half> +; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half> define amdgpu_kernel void @fdiv_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %add = fdiv <2 x half> %vec, %b @@ -158,9 +168,11 @@ ret void } -; ALL: 'fdiv_v2f16_f32_ftzdaz' -; NOFP16: estimated cost of 24 for {{.*}} fdiv <2 x half> -; FP16: estimated cost of 20 for {{.*}} fdiv <2 x half> +; ALL-LABEL: 'fdiv_v2f16_f32_ftzdaz' +; NOFP16: estimated cost of 32 for {{.*}} fdiv <2 x half> +; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half> +; SIZENOF16: estimated cost of 28 for {{.*}} fdiv <2 x half> +; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half> define amdgpu_kernel void @fdiv_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #1 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %add = fdiv <2 x half> %vec, %b @@ -168,9 +180,11 @@ ret void } -; ALL: 'fdiv_v4f16_f32_ieee' -; NOFP16: estimated cost of 40 for {{.*}} fdiv <4 x half> -; FP16: estimated cost of 40 for {{.*}} fdiv <4 x half> +; 
ALL-LABEL: 'fdiv_v4f16_f32_ieee' +; NOFP16: estimated cost of 56 for {{.*}} fdiv <4 x half> +; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half> +; SIZENOF16: estimated cost of 48 for {{.*}} fdiv <4 x half> +; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half> define amdgpu_kernel void @fdiv_v4f16_f32_ieee(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr %add = fdiv <4 x half> %vec, %b @@ -178,9 +192,11 @@ ret void } -; ALL: 'fdiv_v4f16_f32_ftzdaz' -; NOFP16: estimated cost of 48 for {{.*}} fdiv <4 x half> -; FP16: estimated cost of 40 for {{.*}} fdiv <4 x half> +; ALL-LABEL: 'fdiv_v4f16_f32_ftzdaz' +; NOFP16: estimated cost of 64 for {{.*}} fdiv <4 x half> +; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half> +; SIZENOF16: estimated cost of 56 for {{.*}} fdiv <4 x half> +; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half> define amdgpu_kernel void @fdiv_v4f16_f32_ftzdaz(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #1 { %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr %add = fdiv <4 x half> %vec, %b @@ -188,9 +204,9 @@ ret void } -; ALL: 'rcp_f32_ieee' -; SLOWFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float -; FASTFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float +; ALL-LABEL: 'rcp_f32_ieee' +; THRPTALL: estimated cost of 14 for {{.*}} fdiv float +; SIZEALL: estimated cost of 12 for {{.*}} fdiv float define amdgpu_kernel void @rcp_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 { %vec = load float, float addrspace(1)* %vaddr %add = fdiv float 1.0, %vec @@ -198,8 +214,9 @@ ret void } -; ALL: 'rcp_f32_ftzdaz' -; ALL: estimated cost of 3 for {{.*}} fdiv float +; ALL-LABEL: 'rcp_f32_ftzdaz' +; THRPTALL: estimated cost of 4 for {{.*}} fdiv float +; SIZEALL: estimated cost of 2 for {{.*}} fdiv float define amdgpu_kernel void @rcp_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr) #1 { %vec = load float, float addrspace(1)* %vaddr %add = fdiv float 1.0, %vec @@ -207,9 +224,11 @@ ret void } -; ALL: 'rcp_f16_f32_ieee' -; NOFP16: estimated cost of 10 for {{.*}} fdiv half -; FP16: estimated cost of 3 for {{.*}} fdiv half +; ALL-LABEL: 'rcp_f16_f32_ieee' +; NOFP16: estimated cost of 14 for {{.*}} fdiv half +; FP16: estimated cost of 4 for {{.*}} fdiv half +; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half +; SIZEF16: estimated cost of 2 for {{.*}} fdiv half define amdgpu_kernel void @rcp_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 { %vec = load half, half addrspace(1)* %vaddr %add = fdiv half 1.0, %vec @@ -217,9 +236,9 @@ ret void } -; ALL: 'rcp_f16_f32_ftzdaz' -; NOFP16: estimated cost of 3 for {{.*}} fdiv half -; FP16: estimated cost of 3 for {{.*}} fdiv half +; ALL-LABEL: 'rcp_f16_f32_ftzdaz' +; THRPTALL: estimated cost of 4 for {{.*}} fdiv half +; SIZEALL: estimated cost of 2 for {{.*}} fdiv half define amdgpu_kernel void @rcp_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr) #1 { %vec = load half, half addrspace(1)* %vaddr %add = fdiv half 1.0, %vec @@ -227,11 +246,13 @@ ret void } -; ALL: 'rcp_f64' -; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double -; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double -; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double -; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double +; ALL-LABEL: 'rcp_f64' +; CIFASTF64: estimated cost of 24 for {{.*}} fdiv double +; CISLOWF64: estimated cost of 38 for {{.*}} fdiv 
double +; SIFASTF64: estimated cost of 27 for {{.*}} fdiv double +; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double +; SIZECI: estimated cost of 22 for {{.*}} fdiv double +; SIZESI: estimated cost of 25 for {{.*}} fdiv double define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fdiv double 1.0, %vec @@ -239,9 +260,9 @@ ret void } -; ALL: 'rcp_v2f32_ieee' -; SLOWFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float> -; FASTFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float> +; ALL-LABEL: 'rcp_v2f32_ieee' +; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float> +; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float> define amdgpu_kernel void @rcp_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %add = fdiv <2 x float> <float 1.0, float 1.0>, %vec @@ -249,8 +270,9 @@ ret void } -; ALL: 'rcp_v2f32_ftzdaz' -; ALL: estimated cost of 6 for {{.*}} fdiv <2 x float> +; ALL-LABEL: 'rcp_v2f32_ftzdaz' +; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x float> +; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x float> define amdgpu_kernel void @rcp_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #1 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %add = fdiv <2 x float> <float 1.0, float 1.0>, %vec @@ -258,9 +280,11 @@ ret void } -; ALL: 'rcp_v2f16_f32_ieee' -; NOFP16: estimated cost of 20 for {{.*}} fdiv <2 x half> -; FP16: estimated cost of 6 for {{.*}} fdiv <2 x half> +; ALL-LABEL: 'rcp_v2f16_f32_ieee' +; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half> +; FP16: estimated cost of 8 for {{.*}} fdiv <2 x half> +; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half> +; SIZEF16: estimated cost of 4 for {{.*}} fdiv <2 x half> define amdgpu_kernel void @rcp_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %add = fdiv <2 x half> <half 1.0, half 1.0>, %vec @@ -268,9 +292,9 @@ ret void } -; ALL: 'rcp_v2f16_f32_ftzdaz' -; NOFP16: estimated cost of 6 for {{.*}} fdiv <2 x half> -; FP16: estimated cost of 6 for {{.*}} fdiv <2 x half> +; ALL-LABEL: 'rcp_v2f16_f32_ftzdaz' +; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x half> +; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x half> define amdgpu_kernel void @rcp_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #1 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %add = fdiv <2 x half> <half 1.0, half 1.0>, %vec diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll @@ -1,11 +1,12 @@ -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FAST32,FASTF16,ALL %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOW32,SLOWF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FAST32,FASTF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOW32,SLOWF16,ALL %s +; RUN: opt -cost-model -analyze
-mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF32,FASTF16,ALL %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF32,SLOWF16,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZEF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZENOF16 %s ; ALL-LABEL: 'fma_f32' -; SLOW32: estimated cost of 3 for {{.*}} call float @llvm.fma.f32 -; FAST32: estimated cost of 2 for {{.*}} call float @llvm.fma.f32 +; SLOWF32: estimated cost of 4 for {{.*}} call float @llvm.fma.f32 +; FASTF32: estimated cost of 2 for {{.*}} call float @llvm.fma.f32 +; SIZEALL: estimated cost of 2 for {{.*}} call float @llvm.fma.f32 define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 { %vec = load float, float addrspace(1)* %vaddr %fma = call float @llvm.fma.f32(float %vec, float %vec, float %vec) #1 @@ -14,8 +15,9 @@ } ; ALL-LABEL: 'fma_v2f32' -; SLOW32: estimated cost of 6 for {{.*}} call <2 x float> @llvm.fma.v2f32 -; FAST32: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32 +; SLOWF32: estimated cost of 8 for {{.*}} call <2 x float> @llvm.fma.v2f32 +; FASTF32: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32 +; SIZEALL: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32 define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %vec, <2 x float> %vec, <2 x float> %vec) #1 @@ -24,8 +26,9 @@ } ; ALL-LABEL: 'fma_v3f32' -; SLOW32: estimated cost of 9 for {{.*}} call <3 x float> @llvm.fma.v3f32 -; FAST32: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32 +; SLOWF32: estimated cost of 12 for {{.*}} call <3 x float> @llvm.fma.v3f32 +; FASTF32: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32 +; SIZEALL: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32 define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %fma = call <3 x float> @llvm.fma.v3f32(<3 x float> %vec, <3 x float> %vec, <3 x float> %vec) #1 @@ -34,8 +37,9 @@ } ; ALL-LABEL: 'fma_v5f32' -; SLOW32: estimated cost of 15 for {{.*}} call <5 x float> @llvm.fma.v5f32 -; FAST32: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32 +; SLOWF32: estimated cost of 20 for {{.*}} call <5 x float> @llvm.fma.v5f32 +; FASTF32: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32 +; SIZEALL: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32 define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %fma = call <5 x float> @llvm.fma.v5f32(<5 x float> %vec, <5 x float> %vec, <5 x float> %vec) #1 @@ -44,8 +48,9 @@ } ; ALL-LABEL: 'fma_f64' -; SLOW64: estimated cost of 3 for {{.*}} call double @llvm.fma.f64 -; FAST64: estimated cost of 2 for {{.*}} call double @llvm.fma.f64 +; SLOWF64: estimated cost of 4 for {{.*}} call double @llvm.fma.f64 +; FASTF64: estimated cost of 2 for 
{{.*}} call double @llvm.fma.f64 +; SIZEALL: estimated cost of 2 for {{.*}} call double @llvm.fma.f64 define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 { %vec = load double, double addrspace(1)* %vaddr %fma = call double @llvm.fma.f64(double %vec, double %vec, double %vec) #1 @@ -54,8 +59,9 @@ } ; ALL-LABEL: 'fma_v2f64' -; SLOW64: estimated cost of 6 for {{.*}} call <2 x double> @llvm.fma.v2f64 -; FAST64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64 +; SLOWF64: estimated cost of 8 for {{.*}} call <2 x double> @llvm.fma.v2f64 +; FASTF64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64 +; SIZEALL: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64 define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %vec, <2 x double> %vec, <2 x double> %vec) #1 @@ -64,8 +70,9 @@ } ; ALL-LABEL: 'fma_v3f64' -; SLOW64: estimated cost of 9 for {{.*}} call <3 x double> @llvm.fma.v3f64 -; FAST64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64 +; SLOWF64: estimated cost of 12 for {{.*}} call <3 x double> @llvm.fma.v3f64 +; FASTF64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64 +; SIZEALL: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64 define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %fma = call <3 x double> @llvm.fma.v3f64(<3 x double> %vec, <3 x double> %vec, <3 x double> %vec) #1 @@ -74,8 +81,9 @@ } ; ALL-LABEL: 'fma_f16' -; SLOW16: estimated cost of 3 for {{.*}} call half @llvm.fma.f16 -; FAST16: estimated cost of 2 for {{.*}} call half @llvm.fma.f16 +; SLOWF16: estimated cost of 4 for {{.*}} call half @llvm.fma.f16 +; FASTF16: estimated cost of 2 for {{.*}} call half @llvm.fma.f16 +; SIZEALL: estimated cost of 2 for {{.*}} call half @llvm.fma.f16 define amdgpu_kernel void @fma_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 { %vec = load half, half addrspace(1)* %vaddr %fma = call half @llvm.fma.f16(half %vec, half %vec, half %vec) #1 @@ -84,8 +92,10 @@ } ; ALL-LABEL: 'fma_v2f16' -; SLOW16: estimated cost of 6 for {{.*}} call <2 x half> @llvm.fma.v2f16 -; FAST16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16 +; SLOWF16: estimated cost of 8 for {{.*}} call <2 x half> @llvm.fma.v2f16 +; FASTF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16 +; SIZEF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16 +; SIZENOF16: estimated cost of 4 for {{.*}} call <2 x half> @llvm.fma.v2f16 define amdgpu_kernel void @fma_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %vec, <2 x half> %vec, <2 x half> %vec) #1 @@ -94,8 +104,10 @@ } ; ALL-LABEL: 'fma_v3f16' -; SLOW16: estimated cost of 12 for {{.*}} call <3 x half> @llvm.fma.v3f16 -; FAST16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16 +; SLOWF16: estimated cost of 16 for {{.*}} call <3 x half> @llvm.fma.v3f16 +; FASTF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16 +; SIZEF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16 +; SIZENOF16: estimated cost of 8 for {{.*}} call <3 x half> @llvm.fma.v3f16 
define amdgpu_kernel void @fma_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 { %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %vec, <3 x half> %vec, <3 x half> %vec) #1 diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -1,7 +1,7 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FASTF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOWF16 %s ; ALL-LABEL: 'fmul_f32' ; ALL: estimated cost of 1 for {{.*}} fmul float @@ -22,9 +22,7 @@ } ; ALL-LABEL: 'fmul_v3f32' -; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 3 when it is legal. -; ALL: estimated cost of {{[34]}} for {{.*}} fmul <3 x float> +; ALL: estimated cost of 3 for {{.*}} fmul <3 x float> define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fmul <3 x float> %vec, %b @@ -33,9 +31,7 @@ } ; ALL-LABEL: 'fmul_v5f32' -; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 5 when it is legal. 
-; ALL: estimated cost of {{[58]}} for {{.*}} fmul <5 x float> +; ALL: estimated cost of 5 for {{.*}} fmul <5 x float> define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fmul <5 x float> %vec, %b @@ -45,7 +41,8 @@ ; ALL-LABEL: 'fmul_f64' ; FASTF64: estimated cost of 2 for {{.*}} fmul double -; SLOWF64: estimated cost of 3 for {{.*}} fmul double +; SLOWF64: estimated cost of 4 for {{.*}} fmul double +; SIZEALL: estimated cost of 2 for {{.*}} fmul double define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fmul double %vec, %b @@ -55,7 +52,8 @@ ; ALL-LABEL: 'fmul_v2f64' ; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double> -; SLOWF64: estimated cost of 6 for {{.*}} fmul <2 x double> +; SLOWF64: estimated cost of 8 for {{.*}} fmul <2 x double> +; SIZEALL: estimated cost of 4 for {{.*}} fmul <2 x double> define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %add = fmul <2 x double> %vec, %b @@ -65,7 +63,8 @@ ; ALL-LABEL: 'fmul_v3f64' ; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double> -; SLOWF64: estimated cost of 9 for {{.*}} fmul <3 x double> +; SLOWF64: estimated cost of 12 for {{.*}} fmul <3 x double> +; SIZEALL: estimated cost of 6 for {{.*}} fmul <3 x double> define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %add = fmul <3 x double> %vec, %b diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll @@ -1,9 +1,9 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s -; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s -; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s +; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,FASTF16,ALL %s +; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,SLOWF16,ALL %s -; ALL: 'fsub_f32' +; ALL-LABEL: 'fsub_f32' ; ALL: estimated cost of 1 for {{.*}} fsub float define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { %vec = load float, float addrspace(1)* %vaddr @@ -12,7 +12,7 @@ ret void } -; ALL: 'fsub_v2f32' +; ALL-LABEL: 'fsub_v2f32' ; ALL: estimated cost of 2 for {{.*}} fsub <2 x float> define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { %vec = load <2 x float>, <2 x float> 
addrspace(1)* %vaddr @@ -21,10 +21,8 @@ ret void } -; ALL: 'fsub_v3f32' -; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 3 when it is legal. -; ALL: estimated cost of {{[34]}} for {{.*}} fsub <3 x float> +; ALL-LABEL: 'fsub_v3f32' +; ALL: estimated cost of 3 for {{.*}} fsub <3 x float> define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fsub <3 x float> %vec, %b @@ -32,10 +30,8 @@ ret void } -; ALL: 'fsub_v5f32' -; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 5 when it is legal. -; ALL: estimated cost of {{[58]}} for {{.*}} fsub <5 x float> +; ALL-LABEL: 'fsub_v5f32' +; ALL: estimated cost of 5 for {{.*}} fsub <5 x float> define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fsub <5 x float> %vec, %b @@ -43,9 +39,10 @@ ret void } -; ALL: 'fsub_f64' +; ALL-LABEL: 'fsub_f64' ; FASTF64: estimated cost of 2 for {{.*}} fsub double -; SLOWF64: estimated cost of 3 for {{.*}} fsub double +; SLOWF64: estimated cost of 4 for {{.*}} fsub double +; SIZEALL: estimated cost of 2 for {{.*}} fsub double define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fsub double %vec, %b @@ -53,9 +50,10 @@ ret void } -; ALL: 'fsub_v2f64' +; ALL-LABEL: 'fsub_v2f64' ; FASTF64: estimated cost of 4 for {{.*}} fsub <2 x double> -; SLOWF64: estimated cost of 6 for {{.*}} fsub <2 x double> +; SLOWF64: estimated cost of 8 for {{.*}} fsub <2 x double> +; SIZEALL: estimated cost of 4 for {{.*}} fsub <2 x double> define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %add = fsub <2 x double> %vec, %b @@ -63,9 +61,10 @@ ret void } -; ALL: 'fsub_v3f64' +; ALL-LABEL: 'fsub_v3f64' ; FASTF64: estimated cost of 6 for {{.*}} fsub <3 x double> -; SLOWF64: estimated cost of 9 for {{.*}} fsub <3 x double> +; SLOWF64: estimated cost of 12 for {{.*}} fsub <3 x double> +; SIZEALL: estimated cost of 6 for {{.*}} fsub <3 x double> define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %add = fsub <3 x double> %vec, %b @@ -73,7 +72,7 @@ ret void } -; ALL: 'fsub_f16' +; ALL-LABEL: 'fsub_f16' ; ALL: estimated cost of 1 for {{.*}} fsub half define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { %vec = load half, half addrspace(1)* %vaddr @@ -82,7 +81,7 @@ ret void } -; ALL: 'fsub_v2f16' +; ALL-LABEL: 'fsub_v2f16' ; SLOWF16: estimated cost of 2 for {{.*}} fsub <2 x half> ; FASTF16: estimated cost of 1 for {{.*}} fsub <2 x half> define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { @@ -92,7 +91,7 @@ ret void } -; ALL: 'fsub_v3f16' +; ALL-LABEL: 'fsub_v3f16' ; SLOWF16: estimated cost of 4 for {{.*}} fsub <3 x half> ; FASTF16: estimated cost of 2 for {{.*}} fsub <3 x half> define amdgpu_kernel void @fsub_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 { @@ -102,7 +101,7 @@ ret void 
} -; ALL: 'fsub_v4f16' +; ALL-LABEL: 'fsub_v4f16' ; SLOWF16: estimated cost of 4 for {{.*}} fsub <4 x half> ; FASTF16: estimated cost of 2 for {{.*}} fsub <4 x half> define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll @@ -1,11 +1,11 @@ -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,NOCONTRACT,ALL %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,ALL %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED32,FUSED16,NOCONTRACT,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED32,FUSED16,CONTRACT,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,ALL %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,NOCONTRACT,THRPTALL,ALL %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,THRPTALL,ALL %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,THRPTALL,ALL %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,THRPTALL,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,SZNOCONTRACT,SIZEALL,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 
-denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,SZNOCONTRACT,SIZEALL,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,SIZEALL,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,SZNOCONTRACT,SIZEALL,ALL %s target triple = "amdgcn--" @@ -113,8 +113,10 @@ ; ALL-LABEL: 'fmul_fadd_f64': ; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double -; NOCONTRACT: estimated cost of 3 for instruction: %mul = fmul double -; ALL: estimated cost of 3 for instruction: %add = fadd double +; NOCONTRACT: estimated cost of 4 for instruction: %mul = fmul double +; SZNOCONTRACT: estimated cost of 2 for instruction: %mul = fmul double +; THRPTALL: estimated cost of 4 for instruction: %add = fadd double +; SIZEALL: estimated cost of 2 for instruction: %add = fadd double define double @fmul_fadd_f64(double %r0, double %r1, double %r2) #0 { %mul = fmul double %r0, %r1 %add = fadd double %mul, %r2 @@ -123,7 +125,8 @@ ; ALL-LABEL: 'fmul_fadd_contract_f64': ; ALL: estimated cost of 0 for instruction: %mul = fmul contract double -; ALL: estimated cost of 3 for instruction: %add = fadd contract double +; THRPTALL: estimated cost of 4 for instruction: %add = fadd contract double +; SIZEALL: estimated cost of 2 for instruction: %add = fadd contract double define double @fmul_fadd_contract_f64(double %r0, double %r1, double %r2) #0 { %mul = fmul contract double %r0, %r1 %add = fadd contract double %mul, %r2 @@ -132,8 +135,10 @@ ; ALL-LABEL: 'fmul_fadd_v2f64': ; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double> -; NOCONTRACT: estimated cost of 6 for instruction: %mul = fmul <2 x double> -; ALL: estimated cost of 6 for instruction: %add = fadd <2 x double> +; NOCONTRACT: estimated cost of 8 for instruction: %mul = fmul <2 x double> +; SZNOCONTRACT: estimated cost of 4 for instruction: %mul = fmul <2 x double> +; THRPTALL: estimated cost of 8 for instruction: %add = fadd <2 x double> +; SIZEALL: estimated cost of 4 for instruction: %add = fadd <2 x double> define <2 x double> @fmul_fadd_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 { %mul = fmul <2 x double> %r0, %r1 %add = fadd <2 x double> %mul, %r2 @@ -142,8 +147,10 @@ ; ALL-LABEL: 'fmul_fsub_f64': ; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double -; NOCONTRACT: estimated cost of 3 for instruction: %mul = fmul double -; ALL: estimated cost of 3 for instruction: %sub = fsub double +; NOCONTRACT: estimated cost of 4 for instruction: %mul = fmul double +; SZNOCONTRACT: estimated cost of 2 for instruction: %mul = fmul double +; THRPTALL: estimated cost of 4 for instruction: %sub = fsub double +; SIZEALL: estimated cost of 2 for instruction: %sub = fsub double define double @fmul_fsub_f64(double %r0, double %r1, double %r2) #0 { %mul = fmul double %r0, %r1 %sub = fsub double %mul, %r2 @@ -152,8 +159,10 @@ ; ALL-LABEL: 'fmul_fsub_v2f64': ; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double> -; NOCONTRACT: estimated cost of 6 for instruction: %mul = fmul <2 x double> -; ALL: estimated cost of 6 for instruction: %sub = fsub <2 x double> +; NOCONTRACT: estimated cost of 8 for instruction: %mul = 
+; SZNOCONTRACT: estimated cost of 4 for instruction: %mul = fmul <2 x double>
+; THRPTALL: estimated cost of 8 for instruction: %sub = fsub <2 x double>
+; SIZEALL: estimated cost of 4 for instruction: %sub = fsub <2 x double>
 define <2 x double> @fmul_fsub_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
   %mul = fmul <2 x double> %r0, %r1
   %sub = fsub <2 x double> %mul, %r2
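The contraction tests above all follow one pattern: when the fmul can be fused away it is free (cost 0); otherwise an f64 operation on this quarter-rate configuration costs 4 at throughput and 2 at code-size, and a <2 x double> operation is priced as exactly two scalar operations. Below is a minimal standalone C++ sketch that cross-checks those numbers; the constants and the helper simply restate the CHECK values and the per-element scaling visible above, and are not queried from LLVM:

```cpp
// Cross-check of the fused-cost expectations above. The constants restate
// the CHECK values (4 at throughput, 2 at code-size for one f64 op on a
// quarter-rate part); linear per-element scaling is the rule being checked.
constexpr int F64ThroughputCost = 4; // THRPTALL expectation
constexpr int F64CodeSizeCost = 2;   // SIZEALL expectation

// Vector ops are priced as NElts independent scalar ops.
constexpr int vectorCost(int ScalarCost, int NElts) {
  return ScalarCost * NElts;
}

// 'fmul_fadd_f64' / 'fmul_fsub_f64': scalar f64 op.
static_assert(vectorCost(F64ThroughputCost, 1) == 4, "THRPTALL f64");
static_assert(vectorCost(F64CodeSizeCost, 1) == 2, "SIZEALL f64");
// 'fmul_fadd_v2f64' / 'fmul_fsub_v2f64': exactly twice the scalar cost.
static_assert(vectorCost(F64ThroughputCost, 2) == 8, "THRPTALL v2f64");
static_assert(vectorCost(F64CodeSizeCost, 2) == 4, "SIZEALL v2f64");

int main() { return 0; }
```

The same two constants cover the fsub variants, since fsub is priced identically to fadd in these checks.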
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -1,10 +1,11 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,ALL %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,THRPTALL,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,THRPTALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZESLOW16,SIZEALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=SIZEFAST16,SIZEALL,ALL %s
 
-; ALL: 'mul_i32'
-; ALL: estimated cost of 3 for {{.*}} mul i32
+; ALL-LABEL: 'mul_i32'
+; THRPTALL: estimated cost of 4 for {{.*}} mul i32
+; SIZEALL: estimated cost of 2 for {{.*}} mul i32
 define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %mul = mul i32 %vec, %b
@@ -12,8 +13,9 @@
   ret void
 }
 
-; ALL: 'mul_v2i32'
-; ALL: estimated cost of 6 for {{.*}} mul <2 x i32>
+; ALL-LABEL: 'mul_v2i32'
+; THRPTALL: estimated cost of 8 for {{.*}} mul <2 x i32>
+; SIZEALL: estimated cost of 4 for {{.*}} mul <2 x i32>
 define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %mul = mul <2 x i32> %vec, %b
@@ -21,10 +23,9 @@
   ret void
 }
 
-; ALL: 'mul_v3i32'
-; Allow for 12 when v3i32 is illegal and TargetLowering thinks it needs widening,
-; and 9 when it is legal.
-; ALL: estimated cost of {{9|12}} for {{.*}} mul <3 x i32>
+; ALL-LABEL: 'mul_v3i32'
+; THRPTALL: estimated cost of 12 for {{.*}} mul <3 x i32>
+; SIZEALL: estimated cost of 6 for {{.*}} mul <3 x i32>
 define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %mul = mul <3 x i32> %vec, %b
@@ -32,10 +33,9 @@
   ret void
 }
 
-; ALL: 'mul_v5i32'
-; Allow for 24 when v5i32 is illegal and TargetLowering thinks it needs widening,
-; and 15 when it is legal.
-; ALL: estimated cost of {{15|24}} for {{.*}} mul <5 x i32>
+; ALL-LABEL: 'mul_v5i32'
+; THRPTALL: estimated cost of 20 for {{.*}} mul <5 x i32>
+; SIZEALL: estimated cost of 10 for {{.*}} mul <5 x i32>
 define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
   %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
   %mul = mul <5 x i32> %vec, %b
@@ -43,8 +43,9 @@
   ret void
 }
 
-; ALL: 'mul_v4i32'
-; ALL: estimated cost of 12 for {{.*}} mul <4 x i32>
+; ALL-LABEL: 'mul_v4i32'
+; THRPTALL: estimated cost of 16 for {{.*}} mul <4 x i32>
+; SIZEALL: estimated cost of 8 for {{.*}} mul <4 x i32>
 define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
   %mul = mul <4 x i32> %vec, %b
@@ -52,8 +53,9 @@
   ret void
 }
 
-; ALL: 'mul_i64'
-; ALL: estimated cost of 16 for {{.*}} mul i64
+; ALL-LABEL: 'mul_i64'
+; THRPTALL: estimated cost of 20 for {{.*}} mul i64
+; SIZEALL: estimated cost of 12 for {{.*}} mul i64
 define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %mul = mul i64 %vec, %b
@@ -61,8 +63,9 @@
   ret void
 }
 
-; ALL: 'mul_v2i64'
-; ALL: estimated cost of 32 for {{.*}} mul <2 x i64>
+; ALL-LABEL: 'mul_v2i64'
+; THRPTALL: estimated cost of 40 for {{.*}} mul <2 x i64>
+; SIZEALL: estimated cost of 24 for {{.*}} mul <2 x i64>
 define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %mul = mul <2 x i64> %vec, %b
@@ -70,8 +73,9 @@
   ret void
 }
 
-; ALL: 'mul_v3i64'
-; ALL: estimated cost of 48 for {{.*}} mul <3 x i64>
+; ALL-LABEL: 'mul_v3i64'
+; THRPTALL: estimated cost of 60 for {{.*}} mul <3 x i64>
+; SIZEALL: estimated cost of 36 for {{.*}} mul <3 x i64>
 define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
   %mul = mul <3 x i64> %vec, %b
@@ -79,8 +83,9 @@
   ret void
 }
 
-; ALL: 'mul_v4i64'
-; ALL: estimated cost of 64 for {{.*}} mul <4 x i64>
+; ALL-LABEL: 'mul_v4i64'
+; THRPTALL: estimated cost of 80 for {{.*}} mul <4 x i64>
+; SIZEALL: estimated cost of 48 for {{.*}} mul <4 x i64>
 define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
   %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
   %mul = mul <4 x i64> %vec, %b
@@ -89,8 +94,9 @@
 }
 
-; ALL: 'mul_v8i64'
-; ALL: estimated cost of 256 for {{.*}} mul <8 x i64>
+; ALL-LABEL: 'mul_v8i64'
+; THRPTALL: estimated cost of 320 for {{.*}} mul <8 x i64>
+; SIZEALL: estimated cost of 192 for {{.*}} mul <8 x i64>
 define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
   %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
   %mul = mul <8 x i64> %vec, %b
@@ -98,8 +104,9 @@
   ret void
 }
 
-; ALL: 'mul_i16'
-; ALL: estimated cost of 3 for {{.*}} mul i16
+; ALL-LABEL: 'mul_i16'
+; THRPTALL: estimated cost of 4 for {{.*}} mul i16
+; SIZEALL: estimated cost of 2 for {{.*}} mul i16
 define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
   %mul = mul i16 %vec, %b
@@ -107,9 +114,11 @@
   ret void
 }
 
-; ALL: 'mul_v2i16'
-; SLOW16: estimated cost of 6 for {{.*}} mul <2 x i16>
-; FAST16: estimated cost of 3 for {{.*}} mul <2 x i16>
+; ALL-LABEL: 'mul_v2i16'
+; SLOW16: estimated cost of 8 for {{.*}} mul <2 x i16>
+; FAST16: estimated cost of 4 for {{.*}} mul <2 x i16>
+; SIZESLOW16: estimated cost of 4 for {{.*}} mul <2 x i16>
+; SIZEFAST16: estimated cost of 2 for {{.*}} mul <2 x i16>
 define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %mul = mul <2 x i16> %vec, %b
@@ -117,9 +126,11 @@
   ret void
 }
 
-; ALL: 'mul_v3i16'
-; SLOW16: estimated cost of 12 for {{.*}} mul <3 x i16>
-; FAST16: estimated cost of 6 for {{.*}} mul <3 x i16>
+; ALL-LABEL: 'mul_v3i16'
+; SLOW16: estimated cost of 16 for {{.*}} mul <3 x i16>
+; FAST16: estimated cost of 8 for {{.*}} mul <3 x i16>
+; SIZESLOW16: estimated cost of 8 for {{.*}} mul <3 x i16>
+; SIZEFAST16: estimated cost of 4 for {{.*}} mul <3 x i16>
 define amdgpu_kernel void @mul_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %vaddr, <3 x i16> %b) #0 {
   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %vaddr
   %mul = mul <3 x i16> %vec, %b
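The 64-bit multiply expectations in mul.ll scale per element up to <4 x i64>, while <8 x i64> comes out at 16x the scalar cost rather than 8x, presumably because the type is split again during legalization. The following hypothetical sketch reproduces the table from two constants; the 2x split factor for <8 x i64> is inferred from the 320/192 checks versus the linearly scaled 160/96, not taken from the C++ implementation:

```cpp
#include <cstdio>
#include <initializer_list>

// Per-element i64 multiply costs, restating the new 'mul_i64' CHECK values
// above; these are illustrative constants, not values queried from LLVM.
constexpr int I64MulThroughput = 20; // THRPTALL 'mul_i64'
constexpr int I64MulCodeSize = 12;   // SIZEALL 'mul_i64'

// Assumption: <8 x i64> pays one extra 2x split during type legalization.
constexpr int splitFactor(int NElts) { return NElts > 4 ? 2 : 1; }

constexpr int mulCost(int PerEltCost, int NElts) {
  return PerEltCost * NElts * splitFactor(NElts);
}

int main() {
  // Prints 20/12, 40/24, 60/36, 80/48 and 320/192, matching the checks.
  for (int N : {1, 2, 3, 4, 8})
    std::printf("mul <%d x i64>: throughput %d, code-size %d\n", N,
                mulCost(I64MulThroughput, N), mulCost(I64MulCodeSize, N));
  return 0;
}
```

The N = 1 row corresponds to the scalar 'mul_i64' case; the same per-element rule also explains the i32 rows above (4/2 per element through <5 x i32>).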
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
@@ -1,9 +1,9 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,FAST64,FAST16 %s
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,FAST64,FAST16 %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FAST16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOW16 %s
 
-; ALL: 'shl_i32'
+; ALL-LABEL: 'shl_i32'
 ; ALL: estimated cost of 1 for {{.*}} shl i32
 define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
@@ -12,9 +12,10 @@
   ret void
 }
 
-; ALL: 'shl_i64'
+; ALL-LABEL: 'shl_i64'
 ; FAST64: estimated cost of 2 for {{.*}} shl i64
-; SLOW64: estimated cost of 3 for {{.*}} shl i64
+; SLOW64: estimated cost of 4 for {{.*}} shl i64
+; SIZEALL: estimated cost of 2 for {{.*}} shl i64
 define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = shl i64 %vec, %b
@@ -22,7 +23,7 @@
   ret void
 }
 
-; ALL: 'shl_i16'
+; ALL-LABEL: 'shl_i16'
 ; ALL: estimated cost of 1 for {{.*}} shl i16
 define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
@@ -31,7 +32,7 @@
   ret void
 }
 
-; ALL: 'shl_v2i16'
+; ALL-LABEL: 'shl_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} shl <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} shl <2 x i16>
 define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
@@ -41,7 +42,7 @@
   ret void
 }
 
-; ALL: 'lshr_i32'
+; ALL-LABEL: 'lshr_i32'
 ; ALL: estimated cost of 1 for {{.*}} lshr i32
 define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
@@ -50,9 +51,10 @@
   ret void
 }
 
-; ALL: 'lshr_i64'
+; ALL-LABEL: 'lshr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} lshr i64
-; SLOW64: estimated cost of 3 for {{.*}} lshr i64
+; SLOW64: estimated cost of 4 for {{.*}} lshr i64
+; SIZEALL: estimated cost of 2 for {{.*}} lshr i64
 define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = lshr i64 %vec, %b
@@ -60,7 +62,7 @@
   ret void
 }
 
-; ALL: 'lshr_i16'
+; ALL-LABEL: 'lshr_i16'
 ; ALL: estimated cost of 1 for {{.*}} lshr i16
 define amdgpu_kernel void @lshr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
@@ -69,7 +71,7 @@
   ret void
 }
 
-; ALL: 'lshr_v2i16'
+; ALL-LABEL: 'lshr_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} lshr <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} lshr <2 x i16>
 define amdgpu_kernel void @lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
@@ -79,7 +81,7 @@
   ret void
 }
 
-; ALL: 'ashr_i32'
+; ALL-LABEL: 'ashr_i32'
 ; ALL: estimated cost of 1 for {{.*}} ashr i32
 define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
@@ -88,9 +90,9 @@
   ret void
 }
 
-; ALL: 'ashr_i64'
+; ALL-LABEL: 'ashr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} ashr i64
-; SLOW64: estimated cost of 3 for {{.*}} ashr i64
+; SLOW64: estimated cost of 4 for {{.*}} ashr i64
 define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = ashr i64 %vec, %b
@@ -98,7 +100,7 @@
   ret void
 }
 
-; ALL: 'ashr_i16'
+; ALL-LABEL: 'ashr_i16'
 ; ALL: estimated cost of 1 for {{.*}} ashr i16
 define amdgpu_kernel void @ashr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
@@ -107,7 +109,7 @@
   ret void
 }
 
-; ALL: 'ashr_v2i16'
+; ALL-LABEL: 'ashr_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} ashr <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} ashr <2 x i16>
 define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
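The shifts.ll changes reduce to a single rule for 64-bit shifts: half-rate-64 targets (FAST64) stay at 2, other targets (SLOW64) rise from 3 to 4 at throughput, and the new code-size runs check 2 for shl/lshr via SIZEALL (the ashr_i64 hunk only updates the throughput value). A hypothetical summary in the same style as the sketches above; the enum and helper are illustrative names, not LLVM's:

```cpp
// Summary of the 64-bit shift expectations above: throughput depends on the
// half-rate-64-ops feature, while code-size is 2 either way per the SIZEALL
// checks. Constants restate the CHECK values; nothing is queried from LLVM.
enum class CostKind { RecipThroughput, CodeSize };

constexpr int shiftI64Cost(bool HalfRate64Ops, CostKind Kind) {
  return Kind == CostKind::CodeSize ? 2 : (HalfRate64Ops ? 2 : 4);
}

static_assert(shiftI64Cost(true, CostKind::RecipThroughput) == 2, "FAST64");
static_assert(shiftI64Cost(false, CostKind::RecipThroughput) == 4, "SLOW64");
static_assert(shiftI64Cost(true, CostKind::CodeSize) == 2, "SIZEALL");
static_assert(shiftI64Cost(false, CostKind::CodeSize) == 2, "SIZEALL");

int main() { return 0; }
```

The 32-bit and 16-bit shifts are unaffected: they remain full-rate at cost 1, and packed <2 x i16> shifts stay at 1 on FAST16 targets versus 2 on SLOW16 targets.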