diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4342,8 +4342,13 @@ case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: case ISD::CTPOP: - // Zero extend the argument. - Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0)); + // Zero extend the argument unless it's cttz, then use any_extend. + if (Node->getOpcode() == ISD::CTTZ || + Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF) + Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0)); + else + Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0)); + if (Node->getOpcode() == ISD::CTTZ) { // The count is the same in the promoted type except if the original // value was zero. This can be handled by setting the bit just off diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -131,7 +131,7 @@ ; FUNC-LABEL: {{^}}v_cttz_zero_undef_i8_with_select: ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; SI-SDWA: v_ffbl_b32_sdwa +; SI-SDWA: v_ffbl_b32_e32 ; EG: MEM_RAT MSKOR define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind { %val = load i8, i8 addrspace(1)* %arrayidx, align 1 @@ -144,7 +144,7 @@ ; FUNC-LABEL: {{^}}v_cttz_zero_undef_i16_with_select: ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; SI-SDWA: v_ffbl_b32_sdwa +; SI-SDWA: v_ffbl_b32_e32 ; EG: MEM_RAT MSKOR define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind { %val = load i16, i16 addrspace(1)* %arrayidx, align 1 @@ -246,7 +246,7 @@ ; FUNC-LABEL: {{^}}v_cttz_i8_sel_eq_neg1: ; SI: {{buffer|flat}}_load_ubyte ; SI-NOSDWA: 
v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; SI-SDWA: v_ffbl_b32_sdwa +; SI-SDWA: v_ffbl_b32_e32 ; EG: MEM_RAT MSKOR ; EG: FFBL_INT define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind { diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll --- a/llvm/test/CodeGen/X86/clz.ll +++ b/llvm/test/CodeGen/X86/clz.ll @@ -17,29 +17,25 @@ define i8 @cttz_i8(i8 %x) { ; X32-LABEL: cttz_i8: ; X32: # %bb.0: -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-NEXT: bsfl %eax, %eax +; X32-NEXT: bsfl {{[0-9]+}}(%esp), %eax ; X32-NEXT: # kill: def $al killed $al killed $eax ; X32-NEXT: retl ; ; X64-LABEL: cttz_i8: ; X64: # %bb.0: -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: bsfl %eax, %eax +; X64-NEXT: bsfl %edi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-CLZ-LABEL: cttz_i8: ; X32-CLZ: # %bb.0: -; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-CLZ-NEXT: tzcntl %eax, %eax +; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax ; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X32-CLZ-NEXT: retl ; ; X64-CLZ-LABEL: cttz_i8: ; X64-CLZ: # %bb.0: -; X64-CLZ-NEXT: movzbl %dil, %eax -; X64-CLZ-NEXT: tzcntl %eax, %eax +; X64-CLZ-NEXT: tzcntl %edi, %eax ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq %tmp = call i8 @llvm.cttz.i8( i8 %x, i1 true ) @@ -503,17 +499,16 @@ ; ; X32-CLZ-LABEL: cttz_i8_zero_test: ; X32-CLZ: # %bb.0: -; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X32-CLZ-NEXT: orl $256, %eax # imm = 0x100 +; X32-CLZ-NEXT: movl $256, %eax # imm = 0x100 +; X32-CLZ-NEXT: orl {{[0-9]+}}(%esp), %eax ; X32-CLZ-NEXT: tzcntl %eax, %eax ; X32-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X32-CLZ-NEXT: retl ; ; X64-CLZ-LABEL: cttz_i8_zero_test: ; X64-CLZ: # %bb.0: -; X64-CLZ-NEXT: movzbl %dil, %eax -; X64-CLZ-NEXT: orl $256, %eax # imm = 0x100 -; X64-CLZ-NEXT: tzcntl %eax, %eax +; X64-CLZ-NEXT: orl $256, %edi # imm = 0x100 
+; X64-CLZ-NEXT: tzcntl %edi, %eax ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq %tmp1 = call i8 @llvm.cttz.i8(i8 %n, i1 false)