Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -111,6 +111,16 @@
   /// \returns True.
   bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
 
+  /// \brief Promotes 'ctlz' intrinsic \p I to 32 bit 'ctlz' intrinsic.
+  ///
+  /// \details \p I's base element bit width must be greater than 1 and less
+  /// than or equal to 16. Promotion is done by zero extending the first
+  /// operand to 32 bits, replacing \p I with the 32 bit 'ctlz' intrinsic,
+  /// and truncating its result back to \p I's original type.
+  ///
+  /// \returns True.
+  bool promoteCtlzToI32(IntrinsicInst &I) const;
+
 public:
   static char ID;
   AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
@@ -130,6 +140,7 @@
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+  bool visitCtlzIntrinsicInst(IntrinsicInst &I);
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -308,6 +319,28 @@
   return true;
 }
 
+bool AMDGPUCodeGenPrepare::promoteCtlzToI32(IntrinsicInst &I) const {
+  assert(I.getIntrinsicID() == Intrinsic::ctlz &&
+         "I must be ctlz intrinsic");
+  assert(needsPromotionToI32(I.getType()) &&
+         "I does not need promotion to i32");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getType());
+  Function *I32 =
+      Intrinsic::getDeclaration(Mod, Intrinsic::ctlz, { I32Ty });
+  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
+  Value *ExtRes = Builder.CreateCall(I32, { ExtOp, I.getOperand(1) });
+  Value *TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.eraseFromParent();
+
+  return true;
+}
+
 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
@@ -426,6 +459,8 @@
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
     return visitBitreverseIntrinsicInst(I);
+  case Intrinsic::ctlz:
+    return visitCtlzIntrinsicInst(I);
   default:
     return false;
   }
@@ -441,6 +476,15 @@
   return Changed;
 }
 
+bool AMDGPUCodeGenPrepare::visitCtlzIntrinsicInst(IntrinsicInst &I) {
+  bool Changed = false;
+
+  if (needsPromotionToI32(I.getType()))
+    Changed |= promoteCtlzToI32(I);
+
+  return Changed;
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
   return false;
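For reference, the rewrite promoteCtlzToI32 performs on a scalar i16 ctlz, written out as IR. This is a sketch: the value names %a and %0-%2 are illustrative, and the i1 zero-is-undef flag is carried over from the original call unchanged:

  ; before promotion
  %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)

  ; after promotion; uses of %ctlz are replaced with the trunc result
  %0 = zext i16 %a to i32
  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
  %2 = trunc i32 %1 to i16

Note that, unlike promoteUniformBitreverseToI32, the promoted result is truncated directly, with no adjustment for the widened bit width; the IR and codegen checks below are updated to match.
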
Index: test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
+++ test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
@@ -1039,6 +1039,27 @@
   ret i16 %brev
 }
 
+declare i16 @llvm.ctlz.i16(i16, i1)
+; GCN-LABEL: @ctlz_i16_false(
+; GCN: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; GCN-NEXT: %[[R_32:[0-9]+]] = call i32 @llvm.ctlz.i32(i32 %[[A_32]], i1 false)
+; GCN-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; GCN-NEXT: ret i16 %[[R_16]]
+define i16 @ctlz_i16_false(i16 %a) {
+  %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+  ret i16 %ctlz
+}
+
+; GCN-LABEL: @ctlz_i16_true(
+; GCN: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; GCN-NEXT: %[[R_32:[0-9]+]] = call i32 @llvm.ctlz.i32(i32 %[[A_32]], i1 true)
+; GCN-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; GCN-NEXT: ret i16 %[[R_16]]
+define i16 @ctlz_i16_true(i16 %a) {
+  %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 true)
+  ret i16 %ctlz
+}
+
 ; GCN-LABEL: @add_3xi15(
 ; SI: %r = add <3 x i15> %a, %b
 ; SI-NEXT: ret <3 x i15> %r
@@ -2076,3 +2097,24 @@
   %brev = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> %a)
   ret <3 x i16> %brev
 }
+
+declare <3 x i16> @llvm.ctlz.v3i16(<3 x i16>, i1)
+; GCN-LABEL: @ctlz_v3i16_false(
+; GCN: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
+; GCN-NEXT: %[[R_32:[0-9]+]] = call <3 x i32> @llvm.ctlz.v3i32(<3 x i32> %[[A_32]], i1 false)
+; GCN-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
+; GCN-NEXT: ret <3 x i16> %[[R_16]]
+define <3 x i16> @ctlz_v3i16_false(<3 x i16> %a) {
+  %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %a, i1 false)
+  ret <3 x i16> %ctlz
+}
+
+; GCN-LABEL: @ctlz_v3i16_true(
+; GCN: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
+; GCN-NEXT: %[[R_32:[0-9]+]] = call <3 x i32> @llvm.ctlz.v3i32(<3 x i32> %[[A_32]], i1 true)
+; GCN-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
+; GCN-NEXT: ret <3 x i16> %[[R_16]]
+define <3 x i16> @ctlz_v3i16_true(<3 x i16> %a) {
+  %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %a, i1 true)
+  ret <3 x i16> %ctlz
+}
Index: test/CodeGen/AMDGPU/ctlz.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz.ll
+++ test/CodeGen/AMDGPU/ctlz.ll
@@ -97,12 +97,9 @@
 }
 
 ; FUNC-LABEL: {{^}}v_ctlz_i8:
-; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
-; GCN-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
-; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, [[CTLZ]]
-; GCN-DAG: v_cndmask_b32_e64 [[CORRECTED_FFBH:v[0-9]+]], [[FFBH]], 32, vcc
-; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[CORRECTED_FFBH]]
-; GCN: buffer_store_byte [[RESULT]],
+; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
+; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; GCN: {{buffer|flat}}_store_byte [[FFBH]],
 define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %val = load i8, i8 addrspace(1)* %valptr
   %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
@@ -257,12 +254,10 @@
   ret void
 }
 
-; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; FUNC-LABEL: {{^}}v_ctlz_i7_sel_eq_neg1:
 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
-; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]]
-; GCN: {{buffer|flat}}_store_byte [[TRUNC]],
+; GCN: {{buffer|flat}}_store_byte [[FFBH]],
 define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
Index: test/CodeGen/AMDGPU/ctlz_zero_undef.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -79,10 +79,9 @@
 }
 
 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8:
-; SI: buffer_load_ubyte [[VAL:v[0-9]+]],
+; SI: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
 ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
-; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[FFBH]]
-; SI: buffer_store_byte [[RESULT]],
+; SI: {{buffer|flat}}_store_byte [[FFBH]],
 define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %val = load i8, i8 addrspace(1)* %valptr
   %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
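The RUN lines for these tests sit outside the hunks above. Assuming the pass is still registered under the name used by this file's DEBUG_TYPE, the new IR checks can be reproduced with an invocation along the lines of:

  opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck -check-prefix=GCN %s

with the -check-prefix set taken from the file's existing RUN lines; the ctlz.ll and ctlz_zero_undef.ll changes are exercised by the llc-based RUN lines already present in those files.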