Index: llvm/trunk/include/llvm/Target/TargetSelectionDAG.td =================================================================== --- llvm/trunk/include/llvm/Target/TargetSelectionDAG.td +++ llvm/trunk/include/llvm/Target/TargetSelectionDAG.td @@ -132,7 +132,7 @@ def SDTFPTernaryOp : SDTypeProfile<1, 3, [ // fmadd, fnmsub, etc. SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisFP<0> ]>; -def SDTIntUnaryOp : SDTypeProfile<1, 1, [ // ctlz +def SDTIntUnaryOp : SDTypeProfile<1, 1, [ // ctlz, cttz SDTCisSameAs<0, 1>, SDTCisInt<0> ]>; def SDTIntExtendOp : SDTypeProfile<1, 1, [ // sext, zext, anyext Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2773,7 +2773,7 @@ return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op); case ISD::CTLZ: { EVT VT = Op.getValueType(); - unsigned len = VT.getSizeInBits(); + unsigned Len = VT.getSizeInBits(); if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) { EVT SetCCVT = getSetCCResultType(VT); @@ -2781,7 +2781,7 @@ SDValue Zero = DAG.getConstant(0, dl, VT); SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ); return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero, - DAG.getConstant(len, dl, VT), CTLZ); + DAG.getConstant(Len, dl, VT), CTLZ); } // for now, we do this: @@ -2794,7 +2794,7 @@ // // Ref: "Hacker's Delight" by Henry Warren EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); - for (unsigned i = 0; (1U << i) <= (len / 2); ++i) { + for (unsigned i = 0; (1U << i) <= (Len / 2); ++i) { SDValue Tmp3 = DAG.getConstant(1ULL << i, dl, ShVT); Op = DAG.getNode(ISD::OR, dl, VT, Op, DAG.getNode(ISD::SRL, dl, VT, Op, Tmp3)); @@ -2806,11 +2806,22 @@ // This trivially expands to CTTZ. 
return DAG.getNode(ISD::CTTZ, dl, Op.getValueType(), Op); case ISD::CTTZ: { + EVT VT = Op.getValueType(); + unsigned Len = VT.getSizeInBits(); + + if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) { + EVT SetCCVT = getSetCCResultType(VT); + SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op); + SDValue Zero = DAG.getConstant(0, dl, VT); + SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ); + return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero, + DAG.getConstant(Len, dl, VT), CTTZ); + } + // for now, we use: { return popcount(~x & (x - 1)); } // unless the target has ctlz but not ctpop, in which case we use: // { return 32 - nlz(~x & (x-1)); } // Ref: "Hacker's Delight" by Henry Warren - EVT VT = Op.getValueType(); SDValue Tmp3 = DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, Op, VT), DAG.getNode(ISD::SUB, dl, VT, Op, Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -32,7 +32,7 @@ /// legalized from a smaller type VT. Need to match pre-legalized type because /// the generic legalization inserts the add/sub between the select and /// compare. 
- SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const; + SDValue getFFBX_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, unsigned Opc) const; public: static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op); @@ -57,7 +57,7 @@ SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; @@ -88,7 +88,7 @@ SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, + SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -371,6 +371,7 @@ BFM, // Insert a range of bits into a 32-bit word. FFBH_U32, // ctlz with -1 if input is zero. FFBH_I32, + FFBL_B32, // cttz with -1 if input is zero. 
MUL_U24, MUL_I24, MULHI_U24, Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -417,8 +417,10 @@ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); if (Subtarget->hasFFBL()) - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); + setOperationAction(ISD::CTTZ, MVT::i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom); setOperationAction(ISD::CTLZ, MVT::i64, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); @@ -1113,9 +1115,11 @@ case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: - return LowerCTLZ(Op, DAG); + return LowerCTLZ_CTTZ(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } return Op; @@ -2154,13 +2158,33 @@ return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } -SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { +static bool isCtlzOpc(unsigned Opc) { + return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; +} + +static bool isCttzOpc(unsigned Opc) { + return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; +} + +SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); - bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF || + Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + + unsigned ISDOpc, NewOpc; + if (isCtlzOpc(Op.getOpcode())) { + ISDOpc = ISD::CTLZ_ZERO_UNDEF; + NewOpc = AMDGPUISD::FFBH_U32; + } else if (isCttzOpc(Op.getOpcode())) { + ISDOpc = 
ISD::CTTZ_ZERO_UNDEF; + NewOpc = AMDGPUISD::FFBL_B32; + } else + llvm_unreachable("Unexpected OPCode!!!"); + if (ZeroUndef && Src.getValueType() == MVT::i32) - return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src); + return DAG.getNode(NewOpc, SL, MVT::i32, Src); SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); @@ -2173,24 +2197,33 @@ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); - SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ); + // Pick which half must be zero: hi for ctlz, lo for cttz; compare against 0. + SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo; + SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ); - SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo); - SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi); + SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo); + SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi); const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); - SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32); - - // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) - SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi); + SDValue Add, NewOpr; + if (isCtlzOpc(Op.getOpcode())) { + Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32); + // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) + NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi); + } else { + Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32); + // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x)) + NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo); + } if (!ZeroUndef) { // Test if the full 64-bit input is zero. // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32, // which we probably don't want. 
- SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ); - SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0); + SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi; + SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ); + SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0); // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction // with the same cycles, otherwise it is slower. @@ -2201,11 +2234,11 @@ // The instruction returns -1 for 0 input, but the defined intrinsic // behavior is to return the number of bits. - NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, - SrcIsZero, Bits32, NewCtlz); + NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, + SrcIsZero, Bits32, NewOpr); } - return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz); + return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr); } SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, @@ -3117,13 +3150,10 @@ return false; } -static bool isCtlzOpc(unsigned Opc) { - return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; -} - -SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, +SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, SDValue Op, - const SDLoc &DL) const { + const SDLoc &DL, + unsigned Opc) const { EVT VT = Op.getValueType(); EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && @@ -3133,11 +3163,11 @@ if (VT != MVT::i32) Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); - SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op); + SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op); if (VT != MVT::i32) - FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH); + FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX); - return FFBH; + return FFBX; } // The native instructions return -1 on 0 input. Optimize out a select that // against the bitwidth. 
// // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. -SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, +SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const { ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); @@ -3158,20 +3188,25 @@ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); SDValue CmpLHS = Cond.getOperand(0); + unsigned Opc = (isCttzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) ? + AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32; + + // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x + // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_b32 x if (CCOpcode == ISD::SETEQ && - isCtlzOpc(RHS.getOpcode()) && + (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) { - return getFFBH_U32(DAG, CmpLHS, SL); + return getFFBX_U32(DAG, CmpLHS, SL, Opc); } // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x + // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_b32 x if (CCOpcode == ISD::SETNE && - isCtlzOpc(LHS.getOpcode()) && + (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) && LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) { - return getFFBH_U32(DAG, CmpLHS, SL); + return getFFBX_U32(DAG, CmpLHS, SL, Opc); } return SDValue(); @@ -3304,7 +3339,7 @@ } // There's no reason to not do this if the condition has other uses. 
- return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); + return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI); } static bool isConstantFPZero(SDValue N) { @@ -3892,6 +3927,7 @@ NODE_NAME_CASE(BFM) NODE_NAME_CASE(FFBH_U32) NODE_NAME_CASE(FFBH_I32) + NODE_NAME_CASE(FFBL_B32) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MULHI_U24) Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -298,6 +298,8 @@ def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>; +def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>; + // Signed and unsigned 24-bit multiply. The highest 8-bits are ignore // when performing the mulitply. The result is a 32-bit value. def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, Index: llvm/trunk/lib/Target/AMDGPU/EvergreenInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/EvergreenInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/EvergreenInstructions.td @@ -449,7 +449,7 @@ def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>; def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; -def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; +def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", AMDGPUffbl_b32, VecALU>; let hasSideEffects = 1 in { def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; Index: llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td @@ -159,10 +159,11 @@ def 
S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">; def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">; +def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">; + def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32", - [(set i32:$sdst, (cttz_zero_undef i32:$src0))] + [(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))] >; -def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">; def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32", [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] Index: llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -1,8 +1,12 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-NOSDWA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-SDWA -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s +declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone +declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone +declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone +declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone declare i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -76,3 +80,187 @@ store <4 x 
i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 ret void } + +; FUNC-LABEL: {{^}}s_cttz_zero_undef_i8_with_select: +; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}} +; EG: MEM_RAT MSKOR +; EG: FFBL_INT +define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 %val) nounwind { + %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone + %cttz_ret = icmp ne i8 %val, 0 + %ret = select i1 %cttz_ret, i8 %cttz, i8 32 + store i8 %cttz, i8 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_cttz_zero_undef_i16_with_select: +; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}} +; EG: MEM_RAT MSKOR +; EG: FFBL_INT +define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 %val) nounwind { + %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone + %cttz_ret = icmp ne i16 %val, 0 + %ret = select i1 %cttz_ret, i16 %cttz, i16 32 + store i16 %cttz, i16 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32_with_select: +; SI: s_ff1_i32_b32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBL_INT {{\*? 
*}}[[RESULT]] +define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone + %cttz_ret = icmp ne i32 %val, 0 + %ret = select i1 %cttz_ret, i32 %cttz, i32 32 + store i32 %cttz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_cttz_zero_undef_i64_with_select: +; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}} +; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 %val) nounwind { + %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone + %cttz_ret = icmp ne i64 %val, 0 + %ret = select i1 %cttz_ret, i64 %cttz, i64 32 + store i64 %cttz, i64 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_i8_with_select: +; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; SI-SDWA: v_ffbl_b32_sdwa +; EG: MEM_RAT MSKOR +define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind { + %val = load i8, i8 addrspace(1)* %arrayidx, align 1 + %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone + %cttz_ret = icmp ne i8 %val, 0 + %ret = select i1 %cttz_ret, i8 %cttz, i8 32 + store i8 %ret, i8 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_i16_with_select: +; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; SI-SDWA: v_ffbl_b32_sdwa +; EG: MEM_RAT MSKOR +define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind { + %val = load i16, i16 addrspace(1)* %arrayidx, align 1 + %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone + %cttz_ret = icmp ne i16 %val, 0 + %ret = select i1 %cttz_ret, i16 %cttz, 
i16 32 + store i16 %ret, i16 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32_with_select: +; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { + %val = load i32, i32 addrspace(1)* %arrayidx, align 1 + %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone + %cttz_ret = icmp ne i32 %val, 0 + %ret = select i1 %cttz_ret, i32 %cttz, i32 32 + store i32 %ret, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_i64_with_select: +; SI-NOSDWA: v_or_b32_e32 +; SI-NOSDWA: v_or_b32_e32 +; SI-NOSDWA: v_or_b32_e32 +; SI-SDWA: v_or_b32_sdwa +; SI-NOSDWA: v_or_b32_e32 +; SI-SDWA: v_or_b32_sdwa +; SI: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]] +; SI-DAG: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 addrspace(1)* nocapture readonly %arrayidx) nounwind { + %val = load i64, i64 addrspace(1)* %arrayidx, align 1 + %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone + %cttz_ret = icmp ne i64 %val, 0 + %ret = select i1 %cttz_ret, i64 %cttz, i64 32 + store i64 %ret, i64 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_i32_sel_eq_neg1: +; SI: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL:v[0-9]+]] +; SI: v_cmp_ne_u32_e32 vcc, 0, [[VAL]] +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW +; EG: FFBL_INT +define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { + %val = load i32, i32 addrspace(1)* %arrayidx, align 
1 + %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone + %cmp = icmp eq i32 %val, 0 + %sel = select i1 %cmp, i32 -1, i32 %ctlz + store i32 %sel, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_i32_sel_ne_neg1: +; SI: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL:v[0-9]+]] +; SI: v_cmp_ne_u32_e32 vcc, 0, [[VAL]] +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW +; EG: FFBL_INT +define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { + %val = load i32, i32 addrspace(1)* %arrayidx, align 1 + %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone + %cmp = icmp ne i32 %val, 0 + %sel = select i1 %cmp, i32 %ctlz, i32 -1 + store i32 %sel, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_i32_sel_ne_bitwidth: +; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cmp +; SI: v_cndmask +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW +; EG: FFBL_INT +define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { + %val = load i32, i32 addrspace(1)* %arrayidx, align 1 + %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone + %cmp = icmp ne i32 %ctlz, 32 + %sel = select i1 %cmp, i32 %ctlz, i32 -1 + store i32 %sel, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_i8_sel_eq_neg1: +; SI: {{buffer|flat}}_load_ubyte +; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; EG: MEM_RAT MSKOR +; EG: FFBL_INT + define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind { + %val = load i8, i8 addrspace(1)* %arrayidx, align 1 + %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone + %cmp = icmp eq i8 %val, 0 + %sel = select i1 %cmp, i8 -1, i8 %ctlz + store i8 %sel, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_i16_sel_eq_neg1: +; SI: 
{{buffer|flat}}_load_ubyte +; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: buffer_store_short +; EG: MEM_RAT MSKOR +; EG: FFBL_INT + define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind { + %val = load i16, i16 addrspace(1)* %arrayidx, align 1 + %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone + %cmp = icmp eq i16 %val, 0 + %sel = select i1 %cmp, i16 -1, i16 %ctlz + store i16 %sel, i16 addrspace(1)* %out + ret void +} + +