Index: include/llvm/Target/TargetSelectionDAG.td =================================================================== --- include/llvm/Target/TargetSelectionDAG.td +++ include/llvm/Target/TargetSelectionDAG.td @@ -132,7 +132,7 @@ def SDTFPTernaryOp : SDTypeProfile<1, 3, [ // fmadd, fnmsub, etc. SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisFP<0> ]>; -def SDTIntUnaryOp : SDTypeProfile<1, 1, [ // ctlz +def SDTIntUnaryOp : SDTypeProfile<1, 1, [ // ctlz, cttz SDTCisSameAs<0, 1>, SDTCisInt<0> ]>; def SDTIntExtendOp : SDTypeProfile<1, 1, [ // sext, zext, anyext Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2780,11 +2780,22 @@ // This trivially expands to CTTZ. return DAG.getNode(ISD::CTTZ, dl, Op.getValueType(), Op); case ISD::CTTZ: { + EVT VT = Op.getValueType(); + unsigned len = VT.getSizeInBits(); + + if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) { + EVT SetCCVT = getSetCCResultType(VT); + SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op); + SDValue Zero = DAG.getConstant(0, dl, VT); + SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ); + return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero, + DAG.getConstant(len, dl, VT), CTTZ); + } + // for now, we use: { return popcount(~x & (x - 1)); } // unless the target has ctlz but not ctpop, in which case we use: // { return 32 - nlz(~x & (x-1)); } // Ref: "Hacker's Delight" by Henry Warren - EVT VT = Op.getValueType(); SDValue Tmp3 = DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, Op, VT), DAG.getNode(ISD::SUB, dl, VT, Op, Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -115,6 +115,7 @@ SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; + SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const; void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl &Results) const; void analyzeFormalArgumentsCompute(CCState &State, @@ -371,6 +372,8 @@ BFM, // Insert a range of bits into a 32-bit word. FFBH_U32, // ctlz with -1 if input is zero. FFBH_I32, + FFBL_U32, // cttz with -1 if input is zero. + FFBL_I32, MUL_U24, MUL_I24, MULHI_U24, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -417,7 +417,7 @@ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); if (Subtarget->hasFFBL()) - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); setOperationAction(ISD::CTLZ, MVT::i64, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); @@ -1105,6 +1105,9 @@ case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: + return LowerCTTZ(Op, DAG); case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, DAG); @@ -2014,6 +2017,61 @@ return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } + +SDValue AMDGPUTargetLowering:: LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF; + + if (ZeroUndef && Src.getValueType() == MVT::i32) + return DAG.getNode(AMDGPUISD::FFBL_U32, SL, MVT::i32, Src); + + SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), MVT::i32); + + SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, One, ISD::SETEQ); + + SDValue CttzLo = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SL, MVT::i32, Lo); + SDValue CttzHi = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SL, MVT::i32, Hi); + + const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); + SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CttzHi, Bits32); + + // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x)) + SDValue NewCttz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CttzLo); + + if (!ZeroUndef) { + // Test if the full 64-bit input is zero. + + // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32, + // which we probably don't want. + SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ); + SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0); + + // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction + // with the same cycles, otherwise it is slower. + // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src, + // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ); + + const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32); + + // The instruction returns -1 for 0 input, but the defined intrinsic + // behavior is to return the number of bits. + NewCttz = DAG.getNode(ISD::SELECT, SL, MVT::i32, + SrcIsZero, Bits32, NewCttz); + } + + return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCttz); +} + SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -3742,6 +3800,8 @@ NODE_NAME_CASE(BFM) NODE_NAME_CASE(FFBH_U32) NODE_NAME_CASE(FFBH_I32) + NODE_NAME_CASE(FFBL_U32) + NODE_NAME_CASE(FFBL_I32) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MULHI_U24) Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -298,6 +298,9 @@ def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>; +def AMDGPUffbl_u32 : SDNode<"AMDGPUISD::FFBL_U32", SDTIntUnaryOp>; +def AMDGPUffbl_i32 : SDNode<"AMDGPUISD::FFBL_I32", SDTIntUnaryOp>; + // Signed and unsigned 24-bit multiply. The highest 8-bits are ignore // when performing the mulitply. The result is a 32-bit value. def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, Index: lib/Target/AMDGPU/EvergreenInstructions.td =================================================================== --- lib/Target/AMDGPU/EvergreenInstructions.td +++ lib/Target/AMDGPU/EvergreenInstructions.td @@ -442,7 +442,7 @@ def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>; def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; -def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; +def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", AMDGPUffbl_u32, VecALU>; let hasSideEffects = 1 in { def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; Index: lib/Target/AMDGPU/SOPInstructions.td =================================================================== --- lib/Target/AMDGPU/SOPInstructions.td +++ lib/Target/AMDGPU/SOPInstructions.td @@ -159,10 +159,11 @@ def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">; def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">; +def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">; + def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32", - [(set i32:$sdst, (cttz_zero_undef i32:$src0))] + [(set i32:$sdst, (AMDGPUffbl_u32 i32:$src0))] >; -def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">; def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32", [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] Index: test/CodeGen/AMDGPU/cttz_zero_undef.ll =================================================================== --- test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -76,3 +76,28 @@ store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 ret void } + +; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32_with_select: +; SI: s_ff1_i32_b32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone + %cttz_ret = icmp ne i32 %val, 0 + %ret = select i1 %cttz_ret, i32 %cttz, i32 32 + store i32 %cttz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32_with_select: +; SI: v_ffbl_b32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { + %val = load i32, i32 addrspace(1)* %arrayidx, align 1 + %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone + %cttz_ret = icmp ne i32 %val, 0 + %ret = select i1 %cttz_ret, i32 %cttz, i32 32 + store i32 %ret, i32 addrspace(1)* %out, align 4 + ret void +} +