Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -59,6 +59,8 @@
     getHalfRateInstrCost() : getQuarterRateInstrCost();
   }
 
+  int getSimpleIntrinsicCost(MVT::SimpleValueType VT, unsigned IID) const;
+
 public:
   explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
     : BaseT(TM, F.getParent()->getDataLayout()),
@@ -102,6 +104,15 @@
   unsigned getCFInstrCost(unsigned Opcode);
 
   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
+
+  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                                 ArrayRef<Type *> Tys,
+                                 FastMathFlags FMF,
+                                 unsigned ScalarizationCostPassed = UINT_MAX);
+  int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<Value *> Args, FastMathFlags FMF,
+                            unsigned VF = 1);
+
   bool isSourceOfDivergence(const Value *V) const;
 
   unsigned getFlatAddressSpace() const {
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -254,6 +254,93 @@
   return 64;
 }
 
+// Helper function for getIntrinsicCost and getIntrinsicInstrCost.
+int AMDGPUTTIImpl::getSimpleIntrinsicCost(MVT::SimpleValueType VT,
+                                          unsigned IID) const {
+  switch (IID) {
+  case Intrinsic::fma: {
+    if (VT == MVT::f32) {
+      if (ST->hasFastFMAF32())
+        return getFullRateInstrCost();
+    } else if (VT == MVT::f16) {
+      if (ST->has16BitInsts())
+        return getFullRateInstrCost();
+
+      // TODO: Really need cost of conversions + f32 FMA
+    } else if (VT == MVT::v2f16) {
+      llvm_unreachable("packed types handled separately");
+    }
+
+    return getQuarterRateInstrCost();
+  }
+  case Intrinsic::floor: {
+    const int FullRateCost = getFullRateInstrCost();
+    if (VT == MVT::f32 || VT == MVT::f16)
+      return FullRateCost;
+
+    const int FP64RateCost = get64BitInstrCost();
+    if (ST->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+      return FP64RateCost;
+
+    int Cost = getSimpleIntrinsicCost(VT, Intrinsic::trunc);
+    Cost += 2 * FullRateCost; // setcc x2 i32
+    Cost += FullRateCost;     // and i1
+    Cost += 2 * FullRateCost; // select
+    Cost += FP64RateCost;     // fadd
+
+    return Cost;
+  }
+  case Intrinsic::trunc: {
+    const int FullRateCost = getFullRateInstrCost();
+    if (VT == MVT::f32 || VT == MVT::f16)
+      return FullRateCost;
+
+    const int FP64RateCost = get64BitInstrCost();
+    if (ST->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+      return FP64RateCost;
+
+    int Cost = FullRateCost;  // bfe i32
+    Cost += FullRateCost;     // sub i32
+    Cost += FP64RateCost;     // sra i64
+    Cost += 2 * FullRateCost; // not i64
+    Cost += FullRateCost;     // and i32
+    Cost += 2 * FullRateCost; // setcc i32 x2
+    Cost += 2 * FullRateCost; // and i64
+    Cost += 4 * FullRateCost; // select x2 i64
+
+    return Cost;
+  }
+  case Intrinsic::ctlz:
+  case Intrinsic::cttz: {
+    // FIXME: This sees the legalized type, so doesn't work correctly for
+    // i8/i16.
+    const int FullRateCost = getFullRateInstrCost();
+    if (VT == MVT::i32)
+      return FullRateCost;
+
+    // i64 requires 2 instructions. Illegal types require an additional add.
+    return 2 * FullRateCost;
+  }
+  case Intrinsic::amdgcn_workitem_id_x:
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::amdgcn_workitem_id_z:
+  case Intrinsic::amdgcn_workgroup_id_x:
+  case Intrinsic::amdgcn_workgroup_id_y:
+  case Intrinsic::amdgcn_workgroup_id_z:
+  case Intrinsic::amdgcn_kernarg_segment_ptr:
+  case Intrinsic::amdgcn_implicitarg_ptr:
+  case Intrinsic::amdgcn_implicit_buffer_ptr:
+  case Intrinsic::amdgcn_queue_ptr:
+  case Intrinsic::amdgcn_dispatch_ptr:
+  case Intrinsic::amdgcn_dispatch_id:
+  case Intrinsic::amdgcn_groupstaticsize:
+  case Intrinsic::amdgcn_unreachable:
+  case Intrinsic::amdgcn_wave_barrier:
+    return 0;
+  default:
+    return -1;
+  }
+}
+
 int AMDGPUTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
@@ -375,6 +462,44 @@
   }
 }
 
+unsigned AMDGPUTTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                                              ArrayRef<Type *> Tys,
+                                              FastMathFlags FMF,
+                                              unsigned ScalarizationCostPassed) {
+  EVT OrigTy = TLI->getValueType(DL, RetTy);
+  if (!OrigTy.isSimple())
+    return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF,
+                                        ScalarizationCostPassed);
+
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+  if (LT.second == MVT::v2f16) {
+    assert(ST->hasVOP3PInsts());
+    switch (IID) {
+    case Intrinsic::fma:
+    case Intrinsic::fmuladd:
+      return LT.first * getFullRateInstrCost();
+    default:
+      break;
+    }
+  }
+
+  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
+
+  const int Cost = getSimpleIntrinsicCost(SLT, IID);
+  if (Cost == -1)
+    return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF,
+                                        ScalarizationCostPassed);
+
+  unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
+  return Cost * LT.first * NElts;
+}
+
+int AMDGPUTTIImpl::getIntrinsicInstrCost(
+    Intrinsic::ID IID, Type *RetTy,
+    ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
+}
+
 static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
   switch (I->getIntrinsicID()) {
   case Intrinsic::amdgcn_workitem_id_x:
Index: test/Analysis/CostModel/AMDGPU/ctlz.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/ctlz.ll
@@ -0,0 +1,87 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,VI %s
+; FIXME: CI, VI should have same costs
+
+declare i8 @llvm.ctlz.i8(i8, i1) #0
+declare i16 @llvm.ctlz.i16(i16, i1) #0
+declare i32 @llvm.ctlz.i32(i32, i1) #0
+declare i64 @llvm.ctlz.i64(i64, i1) #0
+
+; GCN-LABEL: 'ctlz_i32'
+; GCN: estimated cost of 1 for {{.*}} call i32 @llvm.ctlz.i32
+define void @ctlz_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr) #1 {
+  %vec = load i32, i32 addrspace(1)* %vaddr
+  %trunc = call i32 @llvm.ctlz.i32(i32 %vec, i1 false)
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_zero_undef_i32'
+; GCN: estimated cost of 1 for {{.*}} call i32 @llvm.ctlz.i32
+define void @ctlz_zero_undef_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr) #1 {
+  %vec = load i32, i32 addrspace(1)* %vaddr
+  %trunc = call i32 @llvm.ctlz.i32(i32 %vec, i1 true)
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_i64'
+; GCN: estimated cost of 2 for {{.*}} call i64 @llvm.ctlz.i64
+define void @ctlz_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr) #1 {
+  %vec = load i64, i64 addrspace(1)* %vaddr
+  %trunc = call i64 @llvm.ctlz.i64(i64 %vec, i1 false)
+  store i64 %trunc, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_zero_undef_i64'
+; GCN: estimated cost of 2 for {{.*}} call i64 @llvm.ctlz.i64
+define void @ctlz_zero_undef_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr) #1 {
+  %vec = load i64, i64 addrspace(1)* %vaddr
+  %trunc = call i64 @llvm.ctlz.i64(i64 %vec, i1 true)
+  store i64 %trunc, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_i8'
+; CI: estimated cost of 1 for {{.*}} call i8 @llvm.ctlz.i8
+; VI: estimated cost of 2 for {{.*}} call i8 @llvm.ctlz.i8
+define void @ctlz_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %vaddr) #1 {
+  %vec = load i8, i8 addrspace(1)* %vaddr
+  %trunc = call i8 @llvm.ctlz.i8(i8 %vec, i1 false)
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_zero_undef_i8'
+; CI: estimated cost of 1 for {{.*}} call i8 @llvm.ctlz.i8
+; VI: estimated cost of 2 for {{.*}} call i8 @llvm.ctlz.i8
+define void @ctlz_zero_undef_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %vaddr) #1 {
+  %vec = load i8, i8 addrspace(1)* %vaddr
+  %trunc = call i8 @llvm.ctlz.i8(i8 %vec, i1 true)
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_i16'
+; CI: estimated cost of 1 for {{.*}} call i16 @llvm.ctlz.i16
+; VI: estimated cost of 2 for {{.*}} call i16 @llvm.ctlz.i16
+define void @ctlz_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr) #1 {
+  %vec = load i16, i16 addrspace(1)* %vaddr
+  %trunc = call i16 @llvm.ctlz.i16(i16 %vec, i1 false)
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_zero_undef_i16'
+; CI: estimated cost of 1 for {{.*}} call i16 @llvm.ctlz.i16
+; VI: estimated cost of 2 for {{.*}} call i16 @llvm.ctlz.i16
+define void @ctlz_zero_undef_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr) #1 {
+  %vec = load i16, i16 addrspace(1)* %vaddr
+  %trunc = call i16 @llvm.ctlz.i16(i16 %vec, i1 true)
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
Index: test/Analysis/CostModel/AMDGPU/cttz.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/cttz.ll
@@ -0,0 +1,86 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,VI %s
+
+declare i8 @llvm.cttz.i8(i8, i1) #0
+declare i16 @llvm.cttz.i16(i16, i1) #0
+declare i32 @llvm.cttz.i32(i32, i1) #0
+declare i64 @llvm.cttz.i64(i64, i1) #0
+
+; GCN-LABEL: 'cttz_i32'
+; GCN: estimated cost of 1 for {{.*}} call i32 @llvm.cttz.i32
+define void @cttz_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr) #1 {
+  %vec = load i32, i32 addrspace(1)* %vaddr
+  %trunc = call i32 @llvm.cttz.i32(i32 %vec, i1 false)
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_zero_undef_i32'
+; GCN: estimated cost of 1 for {{.*}} call i32 @llvm.cttz.i32
+define void @cttz_zero_undef_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr) #1 {
+  %vec = load i32, i32 addrspace(1)* %vaddr
+  %trunc = call i32 @llvm.cttz.i32(i32 %vec, i1 true)
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_i64'
+; GCN: estimated cost of 2 for {{.*}} call i64 @llvm.cttz.i64
+define void @cttz_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr) #1 {
+  %vec = load i64, i64 addrspace(1)* %vaddr
+  %trunc = call i64 @llvm.cttz.i64(i64 %vec, i1 false)
+  store i64 %trunc, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_zero_undef_i64'
+; GCN: estimated cost of 2 for {{.*}} call i64 @llvm.cttz.i64
+define void @cttz_zero_undef_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr) #1 {
+  %vec = load i64, i64 addrspace(1)* %vaddr
+  %trunc = call i64 @llvm.cttz.i64(i64 %vec, i1 true)
+  store i64 %trunc, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_i8'
+; CI: estimated cost of 1 for {{.*}} call i8 @llvm.cttz.i8
+; VI: estimated cost of 2 for {{.*}} call i8 @llvm.cttz.i8
+define void @cttz_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %vaddr) #1 {
+  %vec = load i8, i8 addrspace(1)* %vaddr
+  %trunc = call i8 @llvm.cttz.i8(i8 %vec, i1 false)
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_zero_undef_i8'
+; CI: estimated cost of 1 for {{.*}} call i8 @llvm.cttz.i8
+; VI: estimated cost of 2 for {{.*}} call i8 @llvm.cttz.i8
+define void @cttz_zero_undef_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %vaddr) #1 {
+  %vec = load i8, i8 addrspace(1)* %vaddr
+  %trunc = call i8 @llvm.cttz.i8(i8 %vec, i1 true)
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_i16'
+; CI: estimated cost of 1 for {{.*}} call i16 @llvm.cttz.i16
+; VI: estimated cost of 2 for {{.*}} call i16 @llvm.cttz.i16
+define void @cttz_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr) #1 {
+  %vec = load i16, i16 addrspace(1)* %vaddr
+  %trunc = call i16 @llvm.cttz.i16(i16 %vec, i1 false)
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_zero_undef_i16'
+; CI: estimated cost of 1 for {{.*}} call i16 @llvm.cttz.i16
+; VI: estimated cost of 2 for {{.*}} call i16 @llvm.cttz.i16
+define void @cttz_zero_undef_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr) #1 {
+  %vec = load i16, i16 addrspace(1)* %vaddr
+  %trunc = call i16 @llvm.cttz.i16(i16 %vec, i1 true)
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
Index: test/Analysis/CostModel/AMDGPU/ffloor.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/ffloor.ll
@@ -0,0 +1,113 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefix=SI-FASTFP64 -check-prefix=ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=verde < %s | FileCheck -check-prefix=SI-SLOWFP64 -check-prefix=ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefix=CI-FASTFP64 -check-prefix=ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=bonaire < %s | FileCheck -check-prefix=CI-SLOWFP64 -check-prefix=ALL %s
+
+; ALL: 'floor_f32'
+; ALL: estimated cost of 1 for {{.*}} call float @llvm.floor.f32
+define void @floor_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
+  %vec = load float, float addrspace(1)* %vaddr
+  %floor = call float @llvm.floor.f32(float %vec) #1
+  store float %floor, float addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_v2f32'
+; ALL: estimated cost of 2 for {{.*}} call <2 x float> @llvm.floor.v2f32
+define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
+  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
+  %floor = call <2 x float> @llvm.floor.v2f32(<2 x float> %vec) #1
+  store <2 x float> %floor, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_v3f32'
+; ALL: estimated cost of 3 for {{.*}} call <3 x float> @llvm.floor.v3f32
+define void @floor_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
+  %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
+  %floor = call <3 x float> @llvm.floor.v3f32(<3 x float> %vec) #1
+  store <3 x float> %floor, <3 x float> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_f64'
+; SI-FASTFP64: estimated cost of 22 for {{.*}} call double @llvm.floor.f64
+; SI-SLOWFP64: estimated cost of 24 for {{.*}} call double @llvm.floor.f64
+
+; CI-FASTFP64: estimated cost of 2 for {{.*}} call double @llvm.floor.f64
+; CI-SLOWFP64: estimated cost of 3 for {{.*}} call double @llvm.floor.f64
+define void @floor_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
+  %vec = load double, double addrspace(1)* %vaddr
+  %floor = call double @llvm.floor.f64(double %vec) #1
+  store double %floor, double addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_v2f64'
+; SI-FASTFP64: estimated cost of 44 for {{.*}} call <2 x double> @llvm.floor.v2f64
+; SI-SLOWFP64: estimated cost of 48 for {{.*}} call <2 x double> @llvm.floor.v2f64
+
+; CI-FASTFP64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.floor.v2f64
+; CI-SLOWFP64: estimated cost of 6 for {{.*}} call <2 x double> @llvm.floor.v2f64
+define void @floor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
+  %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
+  %floor = call <2 x double> @llvm.floor.v2f64(<2 x double> %vec) #1
+  store <2 x double> %floor, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_v3f64'
+; SI-FASTFP64: estimated cost of 66 for {{.*}} call <3 x double> @llvm.floor.v3f64
+; SI-SLOWFP64: estimated cost of 72 for {{.*}} call <3 x double> @llvm.floor.v3f64
+
+; CI-FASTFP64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.floor.v3f64
+; CI-SLOWFP64: estimated cost of 9 for {{.*}} call <3 x double> @llvm.floor.v3f64
+define void @floor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
+  %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
+  %floor = call <3 x double> @llvm.floor.v3f64(<3 x double> %vec) #1
+  store <3 x double> %floor, <3 x double> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_f16'
+; ALL: estimated cost of 1 for {{.*}} call half @llvm.floor.f16
+define void @floor_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
+  %vec = load half, half addrspace(1)* %vaddr
+  %floor = call half @llvm.floor.f16(half %vec) #1
+  store half %floor, half addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_v2f16'
+; ALL: estimated cost of 2 for {{.*}} call <2 x half> @llvm.floor.v2f16
+define void @floor_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
+  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
+  %floor = call <2 x half> @llvm.floor.v2f16(<2 x half> %vec) #1
+  store <2 x half> %floor, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should be 3
+; ALL: 'floor_v3f16'
+; ALL: estimated cost of 8 for {{.*}} call <3 x half> @llvm.floor.v3f16
+define void @floor_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
+  %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
+  %floor = call <3 x half> @llvm.floor.v3f16(<3 x half> %vec) #1
+  store <3 x half> %floor, <3 x half> addrspace(1)* %out
+  ret void
+}
+
+declare float @llvm.floor.f32(float) #1
+declare <2 x float> @llvm.floor.v2f32(<2 x float>) #1
+declare <3 x float> @llvm.floor.v3f32(<3 x float>) #1
+
+declare double @llvm.floor.f64(double) #1
+declare <2 x double> @llvm.floor.v2f64(<2 x double>) #1
+declare <3 x double> @llvm.floor.v3f64(<3 x double>) #1
+
+declare half @llvm.floor.f16(half) #1
+declare <2 x half> @llvm.floor.v2f16(<2 x half>) #1
+declare <3 x half> @llvm.floor.v3f16(<3 x half>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
Index: test/Analysis/CostModel/AMDGPU/fma.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/fma.ll
@@ -0,0 +1,83 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,FASTFMA32,SICI,SICI-FASTFMA %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=verde < %s | FileCheck -check-prefixes=GCN,SLOWFMA32,SICI,SICI-SLOWFMA %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,FASTFMA32,SICI,SICI-FASTFMA %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SLOWFMA32,SICI,SICI-SLOWFMA %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,SLOWFMA32,VI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,FASTFMA32,GFX9 %s
+
+; FASTFMA32: Found an estimated cost of 1 for instruction: %fma = call float @llvm.fma.f32(
+; SLOWFMA32: Found an estimated cost of 3 for instruction: %fma = call float @llvm.fma.f32(
+define float @fma_f32(float %a, float %b, float %c) #0 {
+  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+  ret float %fma
+}
+
+; FASTFMA32: Found an estimated cost of 2 for instruction: %fma = call <2 x float> @llvm.fma.v2f32(
+; SLOWFMA32: Found an estimated cost of 6 for instruction: %fma = call <2 x float> @llvm.fma.v2f32(
+define <2 x float> @fma_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+  %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
+  ret <2 x float> %fma
+}
+
+; GCN: Cost Model: Found an estimated cost of 3 for instruction: %fma = call double @llvm.fma.f64(
+define double @fma_f64(double %a, double %b, double %c) #0 {
+  %fma = call double @llvm.fma.f64(double %a, double %b, double %c)
+  ret double %fma
+}
+
+; GCN: Found an estimated cost of 6 for instruction: %fma = call <2 x double> @llvm.fma.v2f64(
+define <2 x double> @fma_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
+  %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %fma
+}
+
+; FIXME: Should be expensive for SI because of conversions
+; SICI-FASTFMA: Found an estimated cost of 1 for instruction: %fma = call half @llvm.fma.f16(
+; SICI-SLOWFMA: Found an estimated cost of 3 for instruction: %fma = call half @llvm.fma.f16(
+; VI: Found an estimated cost of 1 for instruction: %fma = call half @llvm.fma.f16(
+; GFX9: Found an estimated cost of 1 for instruction: %fma = call half @llvm.fma.f16(
+define half @fma_f16(half %a, half %b, half %c) #0 {
+  %fma = call half @llvm.fma.f16(half %a, half %b, half %c)
+  ret half %fma
+}
+
+; SICI-FASTFMA: Cost Model: Found an estimated cost of 2 for instruction: %fma = call <2 x half> @llvm.fma.v2f16(
+; SICI-SLOWFMA: Cost Model: Found an estimated cost of 6 for instruction: %fma = call <2 x half> @llvm.fma.v2f16(
+; VI: Cost Model: Found an estimated cost of 2 for instruction: %fma = call <2 x half> @llvm.fma.v2f16(
+; GFX9: Cost Model: Found an estimated cost of 1 for instruction: %fma = call <2 x half> @llvm.fma.v2f16(
+define <2 x half> @fma_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+  %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+  ret <2 x half> %fma
+}
+
+; FIXME: gfx9 should be 2
+; SICI: Cost Model: Found an estimated cost of 8 for instruction: %fma = call <3 x half> @llvm.fma.v3f16(
+; VI: Cost Model: Found an estimated cost of 8 for instruction: %fma = call <3 x half> @llvm.fma.v3f16(
+; GFX9: Cost Model: Found an estimated cost of 4 for instruction: %fma = call <3 x half> @llvm.fma.v3f16(
+define <3 x half> @fma_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) #0 {
+  %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c)
+  ret <3 x half> %fma
+}
+
+; SICI-FASTFMA: Cost Model: Found an estimated cost of 4 for instruction: %fma = call <4 x half> @llvm.fma.v4f16(
+; SICI-SLOWFMA: Cost Model: Found an estimated cost of 12 for instruction: %fma = call <4 x half> @llvm.fma.v4f16(
+; VI: Cost Model: Found an estimated cost of 4 for instruction: %fma = call <4 x half> @llvm.fma.v4f16(
+; GFX9: Cost Model: Found an estimated cost of 2 for instruction: %fma = call <4 x half> @llvm.fma.v4f16(
+define <4 x half> @fma_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+  %fma = call <4 x half> @llvm.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
+  ret <4 x half> %fma
+}
+
+declare float @llvm.fma.f32(float, float, float) #1
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
+
+declare half @llvm.fma.f16(half, half, half) #1
+declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
+declare <3 x half> @llvm.fma.v3f16(<3 x half>, <3 x half>, <3 x half>) #1
+declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>) #1
+
+declare double @llvm.fma.f64(double, double, double) #1
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
Index: test/Analysis/CostModel/AMDGPU/free-intrinsics.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/free-intrinsics.ll
@@ -0,0 +1,60 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s
+
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %workitem.id.x = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %workitem.id.y = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %workitem.id.z = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %workgroup.id.x = call i32 @llvm.amdgcn.workgroup.id.x()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %workgroup.id.y = call i32 @llvm.amdgcn.workgroup.id.y()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %workgroup.id.z = call i32 @llvm.amdgcn.workgroup.id.z()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %implicit.buffer.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %queue_ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %dispatch_ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.amdgcn.unreachable()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.amdgcn.wave.barrier()
+
+define void @test() #0 {
+  %workitem.id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %workitem.id.y = call i32 @llvm.amdgcn.workitem.id.y()
+  %workitem.id.z = call i32 @llvm.amdgcn.workitem.id.z()
+  %workgroup.id.x = call i32 @llvm.amdgcn.workgroup.id.x()
+  %workgroup.id.y = call i32 @llvm.amdgcn.workgroup.id.y()
+  %workgroup.id.z = call i32 @llvm.amdgcn.workgroup.id.z()
+
+  %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+  %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+  %implicit.buffer.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %queue_ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+  %dispatch_ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+
+  %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
+  %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize()
+
+  call void @llvm.amdgcn.unreachable()
+  call void @llvm.amdgcn.wave.barrier()
+
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare i32 @llvm.amdgcn.workitem.id.y() #1
+declare i32 @llvm.amdgcn.workitem.id.z() #1
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.y() #1
+declare i32 @llvm.amdgcn.workgroup.id.z() #1
+declare i64 @llvm.amdgcn.dispatch.id() #1
+declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #1
+declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #1
+declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #1
+declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #1
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+declare void @llvm.amdgcn.unreachable() #0
+declare i32 @llvm.amdgcn.groupstaticsize() #1
+declare void @llvm.amdgcn.wave.barrier() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { convergent nounwind }
Index: test/Analysis/CostModel/AMDGPU/ftrunc.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/ftrunc.ll
@@ -0,0 +1,113 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefix=SI-FASTFP64 -check-prefix=ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=verde < %s | FileCheck -check-prefix=SI-SLOWFP64 -check-prefix=ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefix=CI-FASTFP64 -check-prefix=ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=bonaire < %s | FileCheck -check-prefix=CI-SLOWFP64 -check-prefix=ALL %s
+
+; ALL: 'trunc_f32'
+; ALL: estimated cost of 1 for {{.*}} call float @llvm.trunc.f32
+define void @trunc_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
+  %vec = load float, float addrspace(1)* %vaddr
+  %trunc = call float @llvm.trunc.f32(float %vec) #1
+  store float %trunc, float addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_v2f32'
+; ALL: estimated cost of 2 for {{.*}} call <2 x float> @llvm.trunc.v2f32
+define void @trunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
+  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
+  %trunc = call <2 x float> @llvm.trunc.v2f32(<2 x float> %vec) #1
+  store <2 x float> %trunc, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_v3f32'
+; ALL: estimated cost of 3 for {{.*}} call <3 x float> @llvm.trunc.v3f32
+define void @trunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
+  %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
+  %trunc = call <3 x float> @llvm.trunc.v3f32(<3 x float> %vec) #1
+  store <3 x float> %trunc, <3 x float> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_f64'
+; SI-FASTFP64: estimated cost of 15 for {{.*}} call double @llvm.trunc.f64
+; SI-SLOWFP64: estimated cost of 16 for {{.*}} call double @llvm.trunc.f64
+
+; CI-FASTFP64: estimated cost of 2 for {{.*}} call double @llvm.trunc.f64
+; CI-SLOWFP64: estimated cost of 3 for {{.*}} call double @llvm.trunc.f64
+define void @trunc_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
+  %vec = load double, double addrspace(1)* %vaddr
+  %trunc = call double @llvm.trunc.f64(double %vec) #1
+  store double %trunc, double addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_v2f64'
+; SI-FASTFP64: estimated cost of 30 for {{.*}} call <2 x double> @llvm.trunc.v2f64
+; SI-SLOWFP64: estimated cost of 32 for {{.*}} call <2 x double> @llvm.trunc.v2f64
+
+; CI-FASTFP64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.trunc.v2f64
+; CI-SLOWFP64: estimated cost of 6 for {{.*}} call <2 x double> @llvm.trunc.v2f64
+define void @trunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
+  %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
+  %trunc = call <2 x double> @llvm.trunc.v2f64(<2 x double> %vec) #1
+  store <2 x double> %trunc, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_v3f64'
+; SI-FASTFP64: estimated cost of 45 for {{.*}} call <3 x double> @llvm.trunc.v3f64
+; SI-SLOWFP64: estimated cost of 48 for {{.*}} call <3 x double> @llvm.trunc.v3f64
+
+; CI-FASTFP64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.trunc.v3f64
+; CI-SLOWFP64: estimated cost of 9 for {{.*}} call <3 x double> @llvm.trunc.v3f64
+define void @trunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
+  %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
+  %trunc = call <3 x double> @llvm.trunc.v3f64(<3 x double> %vec) #1
+  store <3 x double> %trunc, <3 x double> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_f16'
+; ALL: estimated cost of 1 for {{.*}} call half @llvm.trunc.f16
+define void @trunc_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
+  %vec = load half, half addrspace(1)* %vaddr
+  %trunc = call half @llvm.trunc.f16(half %vec) #1
+  store half %trunc, half addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_v2f16'
+; ALL: estimated cost of 2 for {{.*}} call <2 x half> @llvm.trunc.v2f16
+define void @trunc_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
+  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
+  %trunc = call <2 x half> @llvm.trunc.v2f16(<2 x half> %vec) #1
+  store <2 x half> %trunc, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should be 3
+; ALL: 'trunc_v3f16'
+; ALL: estimated cost of 8 for {{.*}} call <3 x half> @llvm.trunc.v3f16
+define void @trunc_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
+  %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
+  %trunc = call <3 x half> @llvm.trunc.v3f16(<3 x half> %vec) #1
+  store <3 x half> %trunc, <3 x half> addrspace(1)* %out
+  ret void
+}
+
+declare float @llvm.trunc.f32(float) #1
+declare <2 x float> @llvm.trunc.v2f32(<2 x float>) #1
+declare <3 x float> @llvm.trunc.v3f32(<3 x float>) #1
+
+declare double @llvm.trunc.f64(double) #1
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>) #1
+declare <3 x double> @llvm.trunc.v3f64(<3 x double>) #1
+
+declare half @llvm.trunc.f16(half) #1
+declare <2 x half> @llvm.trunc.v2f16(<2 x half>) #1
+declare <3 x half> @llvm.trunc.v3f16(<3 x half>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
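+
+; A rough sketch of where the SI (pre-CI) f64 numbers above come from,
+; assuming the rate constants in AMDGPUTargetTransformInfo.cpp (a full-rate
+; op costs 1; get64BitInstrCost() is 2 with fast FP64 and 3 with slow FP64):
+;   trunc f64 expands to 13 full-rate ops plus one FP64-rate op, giving
+;   13 + 2 = 15 (SI-FASTFP64) and 13 + 3 = 16 (SI-SLOWFP64).
+; floor f64 (see ffloor.ll) adds 5 more full-rate ops and one FP64-rate fadd
+; on top of the trunc expansion, giving 22 and 24 respectively.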