Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -77,14 +77,14 @@
   ///
   /// \returns True if 16 bit binary operation is promoted to equivalent 32 bit
   /// binary operation, false otherwise.
-  bool promoteUniformI16OpToI32Op(BinaryOperator &I) const;
+  bool promoteUniformI16OpToI32(BinaryOperator &I) const;
 
   /// \brief Promotes uniform 16 bit 'icmp' operation \p I to 32 bit 'icmp'
   /// operation by sign or zero extending operands to 32 bits, and replacing 16
   /// bit operation with 32 bit operation.
   ///
   /// \returns True.
-  bool promoteUniformI16OpToI32Op(ICmpInst &I) const;
+  bool promoteUniformI16OpToI32(ICmpInst &I) const;
 
   /// \brief Promotes uniform 16 bit 'select' operation \p I to 32 bit 'select'
   /// operation by sign or zero extending operands to 32 bits, replacing 16 bit
@@ -92,7 +92,16 @@
   /// operation back to 16 bits.
   ///
   /// \returns True.
-  bool promoteUniformI16OpToI32Op(SelectInst &I) const;
+  bool promoteUniformI16OpToI32(SelectInst &I) const;
+
+  /// \brief Promotes uniform 16 bit 'bitreverse' intrinsic \p I to 32 bit
+  /// 'bitreverse' intrinsic by zero extending operand to 32 bits, replacing 16
+  /// bit intrinsic with 32 bit intrinsic, shifting the result of 32 bit
+  /// intrinsic 16 bits to the right with zero fill, and truncating the result
+  /// of shift operation back to 16 bits.
+  ///
+  /// \returns True.
+  bool promoteUniformI16BitreverseIntrinsicToI32(IntrinsicInst &I) const;
 
 public:
   static char ID;
@@ -111,6 +120,9 @@
   bool visitICmpInst(ICmpInst &I);
   bool visitSelectInst(SelectInst &I);
 
+  bool visitIntrinsicInst(IntrinsicInst &I);
+  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
 
@@ -183,8 +195,8 @@
       cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
 }
 
-bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const {
-  assert(isI16Ty(I.getType()) && "Op must be 16 bits");
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32(BinaryOperator &I) const {
+  assert(isI16Ty(I.getType()) && "I must be 16 bits");
 
   if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::UDiv)
     return false;
@@ -214,7 +226,7 @@
   return true;
 }
 
-bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(ICmpInst &I) const {
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32(ICmpInst &I) const {
   assert(isI16Ty(I.getOperand(0)->getType()) && "Op0 must be 16 bits");
   assert(isI16Ty(I.getOperand(1)->getType()) && "Op1 must be 16 bits");
 
@@ -242,8 +254,8 @@
   return true;
 }
 
-bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(SelectInst &I) const {
-  assert(isI16Ty(I.getType()) && "Op must be 16 bits");
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32(SelectInst &I) const {
+  assert(isI16Ty(I.getType()) && "I must be 16 bits");
 
   IRBuilder<> Builder(&I);
   Builder.SetCurrentDebugLocation(I.getDebugLoc());
@@ -270,6 +282,33 @@
   return true;
 }
 
+bool AMDGPUCodeGenPrepare::promoteUniformI16BitreverseIntrinsicToI32(
+    IntrinsicInst &I) const {
+  assert(I.getIntrinsicID() == Intrinsic::bitreverse && "I must be bitreverse");
+  assert(isI16Ty(I.getType()) && "I must be 16 bits");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  // Zero extend the operand so that the upper 16 bits of the input are zero.
+  Type *WideTy = getI32Ty(Builder, I.getType());
+  Value *WideOp = Builder.CreateZExt(I.getOperand(0), WideTy);
+
+  // A 32 bit reverse leaves the reversed 16 bit value in the upper half.
+  Function *WideRev =
+      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { WideTy });
+  Value *WideRes = Builder.CreateCall(WideRev, { WideOp });
+
+  // Move the upper half down with zero fill, then narrow back to 16 bits.
+  Value *Shifted = Builder.CreateLShr(WideRes, 16);
+  Value *NarrowRes =
+      Builder.CreateTrunc(Shifted, getI16Ty(Builder, WideRes->getType()));
+  I.replaceAllUsesWith(NarrowRes);
+  I.eraseFromParent();
+
+  return true;
+}
+
 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
@@ -359,7 +398,7 @@
 
   // TODO: Should we promote smaller types that will be legalized to i16?
   if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
-    Changed |= promoteUniformI16OpToI32Op(I);
+    Changed |= promoteUniformI16OpToI32(I);
 
   return Changed;
 }
@@ -370,7 +409,7 @@
   // TODO: Should we promote smaller types that will be legalized to i16?
   if (ST->has16BitInsts() && isI16Ty(I.getOperand(0)->getType()) &&
           isI16Ty(I.getOperand(1)->getType()) && DA->isUniform(&I))
-    Changed |= promoteUniformI16OpToI32Op(I);
+    Changed |= promoteUniformI16OpToI32(I);
 
   return Changed;
 }
@@ -380,7 +419,26 @@
 
   // TODO: Should we promote smaller types that will be legalized to i16?
   if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
-    Changed |= promoteUniformI16OpToI32Op(I);
+    Changed |= promoteUniformI16OpToI32(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
+  // Only 'bitreverse' is handled here; every other intrinsic is left as is and
+  // reported as unchanged.
+  if (I.getIntrinsicID() != Intrinsic::bitreverse)
+    return false;
+
+  return visitBitreverseIntrinsicInst(I);
+}
+
+bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
+  bool Changed = false;
+  // Promote only a uniform 16 bit 'bitreverse', and only if has16BitInsts().
+  // TODO: Should we promote smaller types that will be legalized to i16?
+  if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
+    Changed |= promoteUniformI16BitreverseIntrinsicToI32(I);
 
   return Changed;
 }
Index: test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
+++ test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
@@ -430,6 +430,18 @@
   ret i16 %sel
 }
 
+declare i16 @llvm.bitreverse.i16(i16)
+; VI-LABEL: @bitreverse_i16(
+; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; VI: %[[R_32:[0-9]+]] = call i32 @llvm.bitreverse.i32(i32 %[[A_32]])
+; VI: %[[S_32:[0-9]+]] = lshr i32 %[[R_32]], 16
+; VI: %[[R_16:[0-9]+]] = trunc i32 %[[S_32]] to i16
+; VI: ret i16 %[[R_16]]
+define i16 @bitreverse_i16(i16 %a) {
+  %rev = call i16 @llvm.bitreverse.i16(i16 %a)
+  ret i16 %rev
+}
+
 ; VI-LABEL: @add_3xi16(
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
@@ -854,3 +866,15 @@
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
   ret <3 x i16> %sel
 }
+
+declare <3 x i16> @llvm.bitreverse.v3i16(<3 x i16>)
+; VI-LABEL: @bitreverse_3xi16(
+; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
+; VI: %[[R_32:[0-9]+]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> %[[A_32]])
+; VI: %[[S_32:[0-9]+]] = lshr <3 x i32> %[[R_32]], <i32 16, i32 16, i32 16>
+; VI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[S_32]] to <3 x i16>
+; VI: ret <3 x i16> %[[R_16]]
+define <3 x i16> @bitreverse_3xi16(<3 x i16> %a) {
+  %rev = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> %a)
+  ret <3 x i16> %rev
+}