Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -39,6 +39,53 @@
   Module *Mod;
   bool HasUnsafeFPMath;
 
+  /// \returns True if binary operation \p I is a signed binary operation,
+  /// false otherwise.
+  bool isSigned(const BinaryOperator &I) const {
+    return I.getOpcode() == Instruction::AShr ||
+           I.getOpcode() == Instruction::SDiv ||
+           I.getOpcode() == Instruction::SRem;
+  }
+
+  /// \returns True if 'icmp' operation \p I is a signed 'icmp' operation,
+  /// false otherwise.
+  bool isSigned(const ICmpInst &I) const {
+    return I.getPredicate() == ICmpInst::ICMP_SGT ||
+           I.getPredicate() == ICmpInst::ICMP_SGE ||
+           I.getPredicate() == ICmpInst::ICMP_SLT ||
+           I.getPredicate() == ICmpInst::ICMP_SLE;
+  }
+
+  /// \returns True if the condition of 'select' operation \p I comes from a
+  /// signed 'icmp' operation, false otherwise.
+  bool isSigned(const SelectInst &I) const {
+    return isa<ICmpInst>(I.getOperand(0)) &&
+           isSigned(*cast<ICmpInst>(I.getOperand(0)));
+  }
+
+  /// \brief Promotes uniform 16 bit binary operation \p I to the equivalent
+  /// 32 bit binary operation by sign or zero extending operands to 32 bits,
+  /// replacing the 16 bit operation with the equivalent 32 bit operation, and
+  /// truncating the result of the 32 bit operation back to 16 bits.
+  ///
+  /// \returns True.
+  bool promoteUniformI16OpToI32Op(BinaryOperator &I) const;
+
+  /// \brief Promotes uniform 16 bit 'icmp' operation \p I to the 32 bit
+  /// 'icmp' operation by sign or zero extending operands to 32 bits, and
+  /// replacing the 16 bit operation with the 32 bit operation.
+  ///
+  /// \returns True.
+  bool promoteUniformI16OpToI32Op(ICmpInst &I) const;
+
+  /// \brief Promotes uniform 16 bit 'select' operation \p I to the 32 bit
+  /// 'select' operation by sign or zero extending operands to 32 bits,
+  /// replacing the 16 bit operation with the 32 bit operation, and truncating
+  /// the result of the 32 bit operation back to 16 bits.
+  ///
+  /// \returns True.
+  bool promoteUniformI16OpToI32Op(SelectInst &I) const;
+
 public:
   static char ID;
   AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
@@ -51,9 +98,10 @@
 
   bool visitFDiv(BinaryOperator &I);
 
-  bool visitInstruction(Instruction &I) {
-    return false;
-  }
+  bool visitInstruction(Instruction &I) { return false; }
+  bool visitBinaryOperator(BinaryOperator &I);
+  bool visitICmpInst(ICmpInst &I);
+  bool visitSelectInst(SelectInst &I);
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -70,6 +118,95 @@
 
 } // End anonymous namespace
 
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const {
+  assert(I.getType()->isIntegerTy(16) && "Op must be 16 bits");
+  assert(DA->isUniform(&I) && "Op must be uniform");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *Int32Ty = Builder.getInt32Ty();
+  Value *ExtOp0 = nullptr;
+  Value *ExtOp1 = nullptr;
+  Value *ExtRes = nullptr;
+  Value *TruncRes = nullptr;
+
+  if (isSigned(I)) {
+    ExtOp0 = Builder.CreateSExt(I.getOperand(0), Int32Ty);
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), Int32Ty);
+  } else {
+    ExtOp0 = Builder.CreateZExt(I.getOperand(0), Int32Ty);
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), Int32Ty);
+  }
+  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
+  TruncRes = Builder.CreateTrunc(ExtRes, Builder.getInt16Ty());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.dropAllReferences();
+  I.eraseFromParent();
+
+  return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(ICmpInst &I) const {
+  assert(I.getOperand(0)->getType()->isIntegerTy(16) && "Op0 must be 16 bits");
+  assert(I.getOperand(1)->getType()->isIntegerTy(16) && "Op1 must be 16 bits");
+  assert(DA->isUniform(&I) && "Op must be uniform");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *Int32Ty = Builder.getInt32Ty();
+  Value *ExtOp0 = nullptr;
+  Value *ExtOp1 = nullptr;
+  Value *NewICmp = nullptr;
+
+  if (isSigned(I)) {
+    ExtOp0 = Builder.CreateSExt(I.getOperand(0), Int32Ty);
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), Int32Ty);
+  } else {
+    ExtOp0 = Builder.CreateZExt(I.getOperand(0), Int32Ty);
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), Int32Ty);
+  }
+  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
+
+  I.replaceAllUsesWith(NewICmp);
+  I.dropAllReferences();
+  I.eraseFromParent();
+
+  return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(SelectInst &I) const {
+  assert(I.getType()->isIntegerTy(16) && "Op must be 16 bits");
+  assert(DA->isUniform(&I) && "Op must be uniform");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *Int32Ty = Builder.getInt32Ty();
+  Value *ExtOp1 = nullptr;
+  Value *ExtOp2 = nullptr;
+  Value *ExtRes = nullptr;
+  Value *TruncRes = nullptr;
+
+  if (isSigned(I)) {
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), Int32Ty);
+    ExtOp2 = Builder.CreateSExt(I.getOperand(2), Int32Ty);
+  } else {
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), Int32Ty);
+    ExtOp2 = Builder.CreateZExt(I.getOperand(2), Int32Ty);
+  }
+  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
+  TruncRes = Builder.CreateTrunc(ExtRes, Builder.getInt16Ty());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.dropAllReferences();
+  I.eraseFromParent();
+
+  return true;
+}
+
 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
@@ -154,6 +291,37 @@
   return Attr.getValueAsString() == "true";
 }
 
+bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+  bool Changed = false;
+
+  // TODO: Should we promote smaller types that will be legalized to i16?
+  if (I.getType()->isIntegerTy(16) && DA->isUniform(&I))
+    Changed |= promoteUniformI16OpToI32Op(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
+  bool Changed = false;
+
+  // TODO: Should we promote smaller types that will be legalized to i16?
+  if (I.getOperand(0)->getType()->isIntegerTy(16) &&
+      I.getOperand(1)->getType()->isIntegerTy(16) && DA->isUniform(&I))
+    Changed |= promoteUniformI16OpToI32Op(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
+  bool Changed = false;
+
+  // TODO: Should we promote smaller types that will be legalized to i16?
+  if (I.getType()->isIntegerTy(16) && DA->isUniform(&I))
+    Changed |= promoteUniformI16OpToI32Op(I);
+
+  return Changed;
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
   return false;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -540,6 +540,10 @@
 
 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
+  // i16 is not desirable unless it is a load or a store.
+  if (VT == MVT::i16 && Op != ISD::LOAD && Op != ISD::STORE)
+    return false;
+
   // SimplifySetCC uses this function to determine whether or not it should
   // create setcc with i1 operands. We don't have instructions for i1 setcc.
   if (VT == MVT::i1 && Op == ISD::SETCC)
Index: test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
+++ test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
@@ -231,6 +231,299 @@
   ret void
 }
 
+; CHECK-LABEL: @promote_uniform_add_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = add i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_add_i16(i16 %a, i16 %b) {
+  %r = add i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_sub_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = sub i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_sub_i16(i16 %a, i16 %b) {
+  %r = sub i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_mul_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = mul i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_mul_i16(i16 %a, i16 %b) {
+  %r = mul i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_udiv_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = udiv i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_udiv_i16(i16 %a, i16 %b) {
+  %r = udiv i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_sdiv_i16(
+; CHECK: %[[A_32:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = sdiv i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_sdiv_i16(i16 %a, i16 %b) {
+  %r = sdiv i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_urem_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = urem i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_urem_i16(i16 %a, i16 %b) {
+  %r = urem i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_srem_i16(
+; CHECK: %[[A_32:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = srem i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_srem_i16(i16 %a, i16 %b) {
+  %r = srem i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_shl_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = shl i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_shl_i16(i16 %a, i16 %b) {
+  %r = shl i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_lshr_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = lshr i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_lshr_i16(i16 %a, i16 %b) {
+  %r = lshr i16 %a, %b
+  ret i16 %r
+}
+
+; 'ashr' must be promoted with sext: zero extending the operands would turn it
+; into an lshr on the low 16 bits.
+; CHECK-LABEL: @promote_uniform_ashr_i16(
+; CHECK: %[[A_32:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = ashr i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_ashr_i16(i16 %a, i16 %b) {
+  %r = ashr i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_and_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = and i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_and_i16(i16 %a, i16 %b) {
+  %r = and i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_or_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = or i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_or_i16(i16 %a, i16 %b) {
+  %r = or i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_xor_i16(
+; CHECK: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[R_32:[0-9]+]] = xor i32 %[[A_32]], %[[B_32]]
+; CHECK: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; CHECK: ret i16 %[[R_16]]
+define i16 @promote_uniform_xor_i16(i16 %a, i16 %b) {
+  %r = xor i16 %a, %b
+  ret i16 %r
+}
+
+; CHECK-LABEL: @promote_uniform_select_eq_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp eq i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_eq_i16(i16 %a, i16 %b) {
+  %cmp = icmp eq i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_ne_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp ne i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_ne_i16(i16 %a, i16 %b) {
+  %cmp = icmp ne i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_ugt_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp ugt i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_ugt_i16(i16 %a, i16 %b) {
+  %cmp = icmp ugt i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_uge_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp uge i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_uge_i16(i16 %a, i16 %b) {
+  %cmp = icmp uge i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_ult_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp ult i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_ult_i16(i16 %a, i16 %b) {
+  %cmp = icmp ult i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_ule_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp ule i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = zext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_ule_i16(i16 %a, i16 %b) {
+  %cmp = icmp ule i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_sgt_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp sgt i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_sgt_i16(i16 %a, i16 %b) {
+  %cmp = icmp sgt i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_sge_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp sge i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_sge_i16(i16 %a, i16 %b) {
+  %cmp = icmp sge i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_slt_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp slt i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_slt_i16(i16 %a, i16 %b) {
+  %cmp = icmp slt i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: @promote_uniform_select_sle_i16(
+; CHECK: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[CMP:[0-9]+]] = icmp sle i32 %[[A_32_0]], %[[B_32_0]]
+; CHECK: %[[A_32_1:[0-9]+]] = sext i16 %a to i32
+; CHECK: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
+; CHECK: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
+; CHECK: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
+; CHECK: ret i16 %[[SEL_16]]
+define i16 @promote_uniform_select_sle_i16(i16 %a, i16 %b) {
+  %cmp = icmp sle i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
 attributes #0 = { nounwind optnone noinline }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind "target-features"="+fp32-denormals" }
Index: test/CodeGen/AMDGPU/ctlz.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz.ll
+++ test/CodeGen/AMDGPU/ctlz.ll
@@ -244,7 +244,7 @@
 ; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1:
 ; SI: buffer_load_ushort [[VAL:v[0-9]+]],
 ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
-; SI: buffer_store_short [[FFBH]],
+; SI: buffer_store_short
 define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
   %val = load i16, i16 addrspace(1)* %valptr
   %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
Index: test/CodeGen/AMDGPU/mul_uint24.ll
===================================================================
--- test/CodeGen/AMDGPU/mul_uint24.ll
+++ test/CodeGen/AMDGPU/mul_uint24.ll
@@ -23,8 +23,8 @@
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
 ; EG: 16
-; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
+; SI: s_mul_i32
+; SI: s_sext_i32_i16
 define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
@@ -33,10 +33,22 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_umul24_2xi16_sext:
+; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
+define void @test_umul24_2xi16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) {
+entry:
+  %mul = mul <2 x i16> %a, %b
+  %ext = sext <2 x i16> %mul to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}test_umul24_i16:
+; SI: s_mul_i32
 ; SI: s_and_b32
-; SI: v_mul_u32_u24_e32
-; SI: v_and_b32_e32
+; SI: v_mov_b32_e32
+
 define void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
@@ -45,6 +57,18 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_umul24_2xi16:
+; SI: v_and_b32
+; SI: v_mul_u32_u24_e32
+; SI: v_and_b32_e32
+define void @test_umul24_2xi16(<2 x i32> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) {
+entry:
+  %mul = mul <2 x i16> %a, %b
+  %ext = zext <2 x i16> %mul to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}test_umul24_i8:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
 ; The result must be sign-extended
Index: test/CodeGen/AMDGPU/sad.ll
===================================================================
--- test/CodeGen/AMDGPU/sad.ll
+++ test/CodeGen/AMDGPU/sad.ll
@@ -202,17 +202,18 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}v_sad_u32_i16_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) {
-  %icmp0 = icmp ugt i16 %a, %b
-  %sub0 = sub i16 %a, %b
-  %sub1 = sub i16 %b, %a
-  %ret0 = select i1 %icmp0, i16 %sub0, i16 %sub1
+; GCN-LABEL: {{^}}v_sad_u32_2xi16_pat2:
+; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @v_sad_u32_2xi16_pat2(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+  %icmp0 = icmp ugt <2 x i16> %a, %b
+  %sub0 = sub <2 x i16> %a, %b
+  %sub1 = sub <2 x i16> %b, %a
+  %ret0 = select <2 x i1> %icmp0, <2 x i16> %sub0, <2 x i16> %sub1
 
-  %ret = add i16 %ret0, %c
+  %ret = add <2 x i16> %ret0, %c
 
-  store i16 %ret, i16 addrspace(1)* %out
+  store <2 x i16> %ret, <2 x i16> addrspace(1)* %out
   ret void
 }
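
As an illustration of the rewrite the pass performs (a minimal sketch mirroring
the promote_uniform_add_i16 test above, with hand-numbered temporaries), a
uniform 16 bit operation such as

  %r = add i16 %a, %b

is replaced by promoteUniformI16OpToI32Op with

  %0 = zext i16 %a to i32
  %1 = zext i16 %b to i32
  %2 = add i32 %0, %1
  %3 = trunc i32 %2 to i16

and every use of %r is rewired to %3. The signed cases (ashr, sdiv, srem, and
selects whose condition is a signed icmp) use sext instead of zext, since only
a sign extended operand makes the 32 bit operation agree with the original 16
bit one in the low 16 bits.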