Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -530,14 +530,32 @@
   return true;
 }
 
+// Find a select instruction, looking through at most one single-use cast (the
+// caller erases the cast, so it must have no other users). This is mostly to
+// deal with cases where i16 selects were promoted here to i32.
+static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
+  Cast = nullptr;
+  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
+    return Sel;
+
+  auto *CI = dyn_cast<CastInst>(V);
+  if (!CI || !CI->hasOneUse() || !isa<SelectInst>(CI->getOperand(0)))
+    return nullptr;
+  Cast = CI;
+  return cast<SelectInst>(CI->getOperand(0));
+}
+
 bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
   // Don't do this unless the old select is going away. We want to eliminate the
   // binary operator, not replace a binop with a select.
   int SelOpNo = 0;
-  SelectInst *Sel = dyn_cast<SelectInst>(BO.getOperand(0));
+
+  CastInst *CastOp;
+
+  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
   if (!Sel || !Sel->hasOneUse()) {
     SelOpNo = 1;
-    Sel = dyn_cast<SelectInst>(BO.getOperand(1));
+    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
   }
 
   if (!Sel || !Sel->hasOneUse())
@@ -549,6 +567,11 @@
   if (!CBO || !CT || !CF)
     return false;
 
+  if (CastOp) {
+    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
+    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
+  }
+
   // TODO: Handle special 0/-1 cases DAG combine does, although we only really
   // need to handle divisions here.
   Constant *FoldedT = SelOpNo ?
@@ -573,6 +596,8 @@
   NewSelect->takeName(&BO);
   BO.replaceAllUsesWith(NewSelect);
   BO.eraseFromParent();
+  if (CastOp)
+    CastOp->eraseFromParent();
   Sel->eraseFromParent();
   return true;
 }
Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -410,13 +410,18 @@
 ; IR-LABEL: @select_mul_rhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
 ; IR-NEXT:    ret i32 [[OP]]
+;
   %select = select i1 %cond, i32 5, i32 8
   %op = mul i32 %select, 1000
   ret i32 %op
 }
 
-; FIXME: Truncate from promoted select blocks this.
 define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
+; IR-LABEL: @select_add_lhs_const_i16(
+; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i16 128, i16 131
+; IR-NEXT:    store i16 [[OP]], i16 addrspace(1)* undef
+; IR-NEXT:    ret void
+
 ; GCN-LABEL: select_add_lhs_const_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
@@ -428,16 +433,62 @@
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    flat_store_short v[0:1], v0
 ; GCN-NEXT:    s_endpgm
-; IR-LABEL: @select_add_lhs_const_i16(
-; IR-NEXT:    [[TMP1:%.*]] = select i1 [[COND:%.*]], i32 5, i32 8
-; IR-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; IR-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
-; IR-NEXT:    [[TMP4:%.*]] = add nuw nsw i32 [[TMP3]], 123
-; IR-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
-; IR-NEXT:    store i16 [[TMP5]], i16 addrspace(1)* undef
-; IR-NEXT:    ret void
+;
   %select = select i1 %cond, i16 5, i16 8
   %op = add i16 %select, 123
   store i16 %op, i16 addrspace(1)* undef
   ret void
 }
+
+define i16 @select_add_trunc_select(i1 %cond) {
+; GCN-LABEL: select_add_trunc_select:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+; IR-LABEL: @select_add_trunc_select(
+; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50
+; IR-NEXT:    ret i16 [[OP]]
+; The add folds through the trunc into the select arms: 5 + 42 and 8 + 42.
+  %select = select i1 %cond, i32 5, i32 8
+  %trunc = trunc i32 %select to i16
+  %op = add i16 %trunc, 42
+  ret i16 %op
+}
+
+define i32 @select_add_sext_select(i1 %cond) {
+; IR-LABEL: @select_add_sext_select(
+; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 29, i32 50
+; IR-NEXT:    ret i32 [[OP]]
+; GCN-LABEL: select_add_sext_select:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 29, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %select = select i1 %cond, i16 -13, i16 8
+  %sext = sext i16 %select to i32
+  %op = add i32 %sext, 42
+  ret i32 %op
+}
+
+define i32 @select_add_zext_select(i1 %cond) {
+; IR-LABEL: @select_add_zext_select(
+; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 47, i32 50
+; IR-NEXT:    ret i32 [[OP]]
+
+; GCN-LABEL: select_add_zext_select:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %select = select i1 %cond, i16 5, i16 8
+  %zext = zext i16 %select to i32
+  %op = add i32 %zext, 42
+  ret i32 %op
+}
Index: llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
+++ llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
@@ -112,9 +112,7 @@
 }
 
 ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16:
-; TODO: shrink i16 constant. This is correct but suboptimal.
-; GCN: v_mov_b32_e32 [[T:v[0-9]+]], 0xffff0009
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[T]],
+; GCN: v_cndmask_b32_e64 v2, 2, 9,
 define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(i16 addrspace(1)* %p, i1 %cond) {
   %sel = select i1 %cond, i16 -4, i16 3
   %bo = sub i16 5, %sel