Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4142,10 +4142,24 @@ Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1); if (Node->getOpcode() == ISD::CTLZ || Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) { - // Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT)) - Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1, - DAG.getConstant(NVT.getSizeInBits() - - OVT.getSizeInBits(), dl, NVT)); + unsigned Bits = NVT.getSizeInBits() - OVT.getSizeInBits(); + // If type-legalization has already promoted the CTLZ once, replace the + // add with an updated constant to avoid the extra unfolded add. This + // is needed as long as DAGCombiner::visitTRUNCATE() will not help this + // case post-legalize. + SDValue NewAdd; + if (Node->hasOneUse() && Node->use_begin()->getOpcode() == ISD::ADD) { + SDNode *Add = *Node->use_begin(); + if (ConstantSDNode *C = dyn_cast(Add->getOperand(1))) { + NewAdd = DAG.getNode(ISD::ADD, dl, OVT, Add->getOperand(0), + DAG.getConstant(C->getSExtValue() - Bits, dl, OVT)); + ReplaceNode(SDValue(Add, 0), NewAdd); + } + } + if (!NewAdd.getNode()) + // Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT)) + Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1, + DAG.getConstant(Bits, dl, NVT)); } Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1)); break; Index: test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz.ll +++ test/CodeGen/AMDGPU/ctlz.ll @@ -112,7 +112,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 32, [[FFBH]], vcc ; SI: v_subrev_i32_e32 [[RESULT:v[0-9]+]], vcc, 24, [[SELECT]] -; VI: v_add_u32_e32 [[RESULT:v[0-9]+]], vcc, -16, [[SELECT]] +; VI: v_u16_e32 [[RESULT:v[0-9]+]], 24, [[SELECT]] ; GCN: buffer_store_byte [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { Index: test/CodeGen/SystemZ/scalar-ctlz.ll =================================================================== --- test/CodeGen/SystemZ/scalar-ctlz.ll +++ test/CodeGen/SystemZ/scalar-ctlz.ll @@ -1,6 +1,4 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -; -; FIXME: two consecutive immediate adds not fused in i16/i8 functions. declare i64 @llvm.ctlz.i64(i64, i1) declare i32 @llvm.ctlz.i32(i32, i1) @@ -56,8 +54,7 @@ ; CHECK-NEXT: # kill ; CHECK-NEXT: llghr %r0, %r2 ; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 -; CHECK-NEXT: ahi %r2, -16 +; CHECK-NEXT: ahi %r2, -48 ; CHECK-NEXT: # kill ; CHECK-NEXT: br %r14 %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 false) @@ -70,8 +67,7 @@ ; CHECK-NEXT: # kill ; CHECK-NEXT: llghr %r0, %r2 ; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 -; CHECK-NEXT: ahi %r2, -16 +; CHECK-NEXT: ahi %r2, -48 ; CHECK-NEXT: # kill ; CHECK-NEXT: br %r14 %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 true) @@ -84,8 +80,7 @@ ; CHECK-NEXT: # kill ; CHECK-NEXT: llgcr %r0, %r2 ; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 -; CHECK-NEXT: ahi %r2, -24 +; CHECK-NEXT: ahi %r2, -56 ; CHECK-NEXT: # kill ; CHECK-NEXT: br %r14 %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 false) @@ -98,8 +93,7 @@ ; CHECK-NEXT: # kill ; CHECK-NEXT: llgcr %r0, %r2 ; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 -; CHECK-NEXT: ahi %r2, -24 +; CHECK-NEXT: ahi %r2, -56 ; CHECK-NEXT: # kill ; CHECK-NEXT: br %r14 %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 true)