Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2521,6 +2521,27 @@
   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
 }
 
+// Get an FFBH_U32 node, handling the case where the incoming op may have been
+// type legalized from a smaller type VT.
+// We need to match the pre-legalized type because the generic legalization
+// inserts the add/sub between the select and the compare.
+static SDValue getFFBH_U32(const TargetLowering &TLI,
+                           SelectionDAG &DAG, SDLoc SL, SDValue Op) {
+  EVT VT = Op.getValueType();
+  EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  if (LegalVT != MVT::i32)
+    return SDValue();
+
+  if (VT != MVT::i32)
+    Op = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Op);
+
+  SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Op);
+  if (VT != MVT::i32)
+    FFBH = DAG.getNode(ISD::TRUNCATE, SL, VT, FFBH);
+
+  return FFBH;
+}
+
 // The native instructions return -1 on 0 input. Optimize out a select that
 // produces -1 on 0.
 //
@@ -2546,7 +2567,7 @@
       isCtlzOpc(RHS.getOpcode()) &&
       RHS.getOperand(0) == CmpLHS &&
       isNegativeOne(LHS)) {
-    return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, CmpLHS);
+    return getFFBH_U32(*this, DAG, SL, CmpLHS);
   }
 
   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
@@ -2554,7 +2575,7 @@
       isCtlzOpc(LHS.getOpcode()) &&
       LHS.getOperand(0) == CmpLHS &&
       isNegativeOne(RHS)) {
-    return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, CmpLHS);
+    return getFFBH_U32(*this, DAG, SL, CmpLHS);
   }
 
   return SDValue();
@@ -2578,10 +2599,7 @@
     return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
 
   // There's no reason to not do this if the condition has other uses.
-  if (VT == MVT::i32)
-    return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
-
-  return SDValue();
+  return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
 }
 
 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
Index: test/CodeGen/AMDGPU/ctlz.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz.ll
+++ test/CodeGen/AMDGPU/ctlz.ll
@@ -2,6 +2,10 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
+declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
+declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
+declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
+
 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
@@ -92,6 +96,20 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}v_ctlz_i8:
+; SI: buffer_load_ubyte [[VAL:v[0-9]+]],
+; SI-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[VAL]]
+; SI-DAG: v_cndmask_b32_e64 [[CORRECTED_FFBH:v[0-9]+]], [[FFBH]], 32, vcc
+; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[CORRECTED_FFBH]]
+; SI: buffer_store_byte [[RESULT]],
+define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i8, i8 addrspace(1)* %valptr
+  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
+  store i8 %ctlz, i8 addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}s_ctlz_i64:
 ; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 ; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s[[HI]]
@@ -209,3 +227,43 @@
   store i32 %sel, i32 addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}v_ctlz_i8_sel_eq_neg1:
+; SI: buffer_load_ubyte [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; SI: buffer_store_byte [[FFBH]],
+define void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i8, i8 addrspace(1)* %valptr
+  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
+  %cmp = icmp eq i8 %val, 0
+  %sel = select i1 %cmp, i8 -1, i8 %ctlz
+  store i8 %sel, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1:
+; SI: buffer_load_ushort [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; SI: buffer_store_short [[FFBH]],
+define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i16, i16 addrspace(1)* %valptr
+  %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
+  %cmp = icmp eq i16 %val, 0
+  %sel = select i1 %cmp, i16 -1, i16 %ctlz
+  store i16 %sel, i16 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i7_sel_eq_neg1:
+; SI: buffer_load_ubyte [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]]
+; SI: buffer_store_byte [[TRUNC]],
+define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i7, i7 addrspace(1)* %valptr
+  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
+  %cmp = icmp eq i7 %val, 0
+  %sel = select i1 %cmp, i7 -1, i7 %ctlz
+  store i7 %sel, i7 addrspace(1)* %out
+  ret void
+}
Index: test/CodeGen/AMDGPU/ctlz_zero_undef.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -78,6 +78,18 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8:
+; SI: buffer_load_ubyte [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[FFBH]]
+; SI: buffer_store_byte [[RESULT]],
+define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i8, i8 addrspace(1)* %valptr
+  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
+  store i8 %ctlz, i8 addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64:
 ; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 ; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s[[HI]]
@@ -160,6 +172,19 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8_sel_eq_neg1:
+; SI: buffer_load_ubyte [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; SI: buffer_store_byte [[FFBH]],
+define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i8, i8 addrspace(1)* %valptr
+  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
+  %cmp = icmp eq i8 %val, 0
+  %sel = select i1 %cmp, i8 -1, i8 %ctlz
+  store i8 %sel, i8 addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
 ; SI: buffer_load_dword [[VAL:v[0-9]+]],
 ; SI-DAG: v_ffbh_u32_e32 [[RESULT0:v[0-9]+]], [[VAL]]
@@ -241,15 +266,3 @@
   store i32 %sel, i32 addrspace(1)* %out
   ret void
 }
-
-; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
-; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[FFBH]]
-; SI: buffer_store_dword [[RESULT]],
-define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i8, i8 addrspace(1)* %valptr
-  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
-  store i8 %ctlz, i8 addrspace(1)* %out
-  ret void
-}
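
A minimal standalone IR sketch of the sub-i32 pattern the combine now handles, mirroring the v_ctlz_i16_sel_eq_neg1 test above; the function name @ctlz_i16_neg1_sketch is illustrative only and is not part of the patch. Because v_ffbh_u32 already returns -1 on a zero input, the compare/select pair around the promoted ctlz is expected to fold away, leaving a single ffbh_u32 on the zero-extended input with the result truncated back to i16:

declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone

; select (icmp eq i16 %val, 0), i16 -1, (ctlz i16 %val) -> ffbh_u32
define i16 @ctlz_i16_neg1_sketch(i16 %val) nounwind {
  %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
  %cmp = icmp eq i16 %val, 0
  %sel = select i1 %cmp, i16 -1, i16 %ctlz
  ret i16 %sel
}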