Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -530,14 +530,32 @@ return true; } +// Find a select instruction, which may have been casted. This is mostly to deal +// with cases where i16 selects were promoted here to i32. +static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) { + Cast = nullptr; + if (SelectInst *Sel = dyn_cast(V)) + return Sel; + + if ((Cast = dyn_cast(V))) { + if (SelectInst *Sel = dyn_cast(Cast->getOperand(0))) + return Sel; + } + + return nullptr; +} + bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const { // Don't do this unless the old select is going away. We want to eliminate the // binary operator, not replace a binop with a select. int SelOpNo = 0; - SelectInst *Sel = dyn_cast(BO.getOperand(0)); + + CastInst *CastOp; + + SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp); if (!Sel || !Sel->hasOneUse()) { SelOpNo = 1; - Sel = dyn_cast(BO.getOperand(1)); + Sel = findSelectThroughCast(BO.getOperand(1), CastOp); } if (!Sel || !Sel->hasOneUse()) @@ -549,6 +567,11 @@ if (!CBO || !CT || !CF) return false; + if (CastOp) { + CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL); + CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL); + } + // TODO: Handle special 0/-1 cases DAG combine does, although we only really // need to handle divisions here. Constant *FoldedT = SelOpNo ? 
@@ -573,6 +596,8 @@ NewSelect->takeName(&BO); BO.replaceAllUsesWith(NewSelect); BO.eraseFromParent(); + if (CastOp) + CastOp->eraseFromParent(); Sel->eraseFromParent(); return true; } Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -410,13 +410,18 @@ ; IR-LABEL: @select_mul_rhs_const_i32( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000 ; IR-NEXT: ret i32 [[OP]] +; %select = select i1 %cond, i32 5, i32 8 %op = mul i32 %select, 1000 ret i32 %op } -; FIXME: Truncate from promoted select blocks this. define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) { +; IR-LABEL: @select_add_lhs_const_i16( +; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 128, i16 131 +; IR-NEXT: store i16 [[OP]], i16 addrspace(1)* undef +; IR-NEXT: ret void + ; GCN-LABEL: select_add_lhs_const_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 @@ -428,16 +433,62 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: flat_store_short v[0:1], v0 ; GCN-NEXT: s_endpgm -; IR-LABEL: @select_add_lhs_const_i16( -; IR-NEXT: [[TMP1:%.*]] = select i1 [[COND:%.*]], i32 5, i32 8 -; IR-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 -; IR-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i32 -; IR-NEXT: [[TMP4:%.*]] = add nuw nsw i32 [[TMP3]], 123 -; IR-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 -; IR-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef -; IR-NEXT: ret void +; %select = select i1 %cond, i16 5, i16 8 %op = add i16 %select, 123 store i16 %op, i16 addrspace(1)* undef ret void } + +define i16 @select_add_trunc_select(i1 %cond) { +; GCN-LABEL: select_add_trunc_select: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; 
GCN-NEXT: v_cndmask_b32_e64 v0, 50, 47, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; IR-LABEL: @select_add_trunc_select( +; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50 +; IR-NEXT: ret i16 [[OP]] +; + %select = select i1 %cond, i32 5, i32 8 + %trunc = trunc i32 %select to i16 + %op = add i16 %trunc, 42 + ret i16 %op +} + +define i32 @select_add_sext_select(i1 %cond) { +; IR-LABEL: @select_add_sext_select( +; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 29, i32 50 +; IR-NEXT: ret i32 [[OP]] +; GCN-LABEL: select_add_sext_select: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 50, 29, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] + %select = select i1 %cond, i16 -13, i16 8 + %trunc = sext i16 %select to i32 + %op = add i32 %trunc, 42 + ret i32 %op +} + +define i32 @select_add_zext_select(i1 %cond) { +; IR-LABEL: @select_add_zext_select( +; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 47, i32 50 +; IR-NEXT: ret i32 [[OP]] + +; GCN-LABEL: select_add_zext_select: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 50, 47, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] + %select = select i1 %cond, i16 5, i16 8 + %trunc = zext i16 %select to i32 + %op = add i32 %trunc, 42 + ret i32 %op +} Index: llvm/test/CodeGen/AMDGPU/dagcombine-select.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/dagcombine-select.ll +++ llvm/test/CodeGen/AMDGPU/dagcombine-select.ll @@ -112,9 +112,7 @@ } ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16: -; TODO: shrink i16 constant. This is correct but suboptimal. 
-; GCN: v_mov_b32_e32 [[T:v[0-9]+]], 0xffff0009 -; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[T]], +; GCN: v_cndmask_b32_e64 v2, 2, 9, define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(i16 addrspace(1)* %p, i1 %cond) { %sel = select i1 %cond, i16 -4, i16 3 %bo = sub i16 5, %sel