Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -54,6 +54,8 @@
   SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
   SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -288,9 +288,11 @@
   if (!Subtarget->hasFFBL())
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
 
-  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
 
+  setOperationAction(ISD::CTLZ, MVT::i64, Custom);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+
   static const MVT::SimpleValueType VectorIntTypes[] = {
     MVT::v2i32, MVT::v4i32
   };
@@ -636,6 +638,9 @@
   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+  case ISD::CTLZ:
+  case ISD::CTLZ_ZERO_UNDEF:
+    return LowerCTLZ(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
   }
   return Op;
@@ -2162,6 +2167,60 @@
   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
 }
 
+SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+  assert(Src.getValueType() == MVT::i64);
+
+  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
+  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
+
+  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
+                                   *DAG.getContext(), MVT::i32);
+
+  SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
+
+  SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
+  SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
+
+  const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
+  SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
+
+  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
+  SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
+
+  if (!ZeroUndef) {
+    // Test if the full 64-bit input is zero.
+
+    // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
+    // which we probably don't want.
+    SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
+    SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
+
+    // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
+    // with the same cycles, otherwise it is slower.
+    // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
+    //     DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
+
+    const SDValue Bits64 = DAG.getConstant(64, SL, MVT::i32);
+
+    // The hardware instruction returns -1 for a 0 input, but the defined
+    // behavior of llvm.ctlz is to return the bit width, i.e. 64.
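+    // When Src == 0 the select above picked CTLZ_ZERO_UNDEF(Lo) + 32,
+    // which is undefined, so it must also be replaced with 64 here.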
+    NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
+                          SrcIsZero, Bits64, NewCtlz);
+  }
+
+  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
+}
+
 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
                                                bool Signed) const {
   SDLoc SL(Op);
Index: test/CodeGen/AMDGPU/ctlz.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz.ll
+++ test/CodeGen/AMDGPU/ctlz.ll
@@ -93,6 +93,16 @@
 }
 
 ; FUNC-LABEL: {{^}}s_ctlz_i64:
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s[[HI]]
+; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
+; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
+; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
+; SI-DAG: v_mov_b32_e32 [[VADD:v[0-9]+]], [[ADD]]
+; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
+; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VADD]]
+; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
+; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
 define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
   store i64 %ctlz, i64 addrspace(1)* %out
@@ -108,6 +118,17 @@
 }
 
 ; FUNC-LABEL: {{^}}v_ctlz_i64:
+; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
+; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
+; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
+; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
+; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]
+; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
+; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]]
+; SI-DAG: v_cndmask_b32_e64 v[[CTLZ_LO:[0-9]+]], v[[CTLZ]], 64, vcc
+; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
+; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ_LO]]:[[CTLZ_HI]]{{\]}}
 define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
Index: test/CodeGen/AMDGPU/ctlz_zero_undef.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -6,6 +6,12 @@
 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
 
+declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
+declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
+declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
 ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i32:
 ; SI: s_load_dword [[VAL:s[0-9]+]],
 ; SI: s_flbit_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
@@ -69,3 +75,59 @@
   store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
+
+; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64:
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s[[HI]]
+; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
+; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
+; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
+; SI-DAG: v_mov_b32_e32 [[VADD:v[0-9]+]], [[ADD]]
+; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
+; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VADD]]
+; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
+; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
+define void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
+  store i64 %ctlz, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64_trunc:
+define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
+  %trunc = trunc i64 %ctlz to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64:
+; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
+; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
+; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
+; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
+; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]
+; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
+; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
+define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+  %val = load i64, i64 addrspace(1)* %in.gep
+  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
+  store i64 %ctlz, i64 addrspace(1)* %out.gep
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64_trunc:
+define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i64, i64 addrspace(1)* %in.gep
+  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
+  %trunc = trunc i64 %ctlz to i32
+  store i32 %trunc, i32 addrspace(1)* %out.gep
+  ret void
+}
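Note: for reference, the expansion implemented by LowerCTLZ above is equivalent to the following scalar computation. This is a minimal C++ sketch for illustration only, not part of the patch; the names ctlz64 and ctlz32 are hypothetical, ctlz32 stands in for the per-half CTLZ_ZERO_UNDEF node (s_flbit_i32_b32 / v_ffbh_u32 on SI), and __builtin_clz assumes a GCC/Clang host compiler.

  #include <cassert>
  #include <cstdint>

  // Models the 32-bit CTLZ_ZERO_UNDEF node: the result is undefined for a
  // zero input, which the assert makes explicit.
  static uint32_t ctlz32(uint32_t X) {
    assert(X != 0 && "CTLZ_ZERO_UNDEF is undefined for 0");
    return __builtin_clz(X);
  }

  uint64_t ctlz64(uint64_t Src, bool ZeroUndef) {
    uint32_t Lo = uint32_t(Src);        // low element of the v2i32 bitcast
    uint32_t Hi = uint32_t(Src >> 32);  // high element of the v2i32 bitcast

    // Defined llvm.ctlz behavior for a zero input is the bit width (64);
    // the hardware instructions return -1 here instead, hence the extra
    // select in the DAG version.
    if (!ZeroUndef && Src == 0)
      return 64;

    // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
    uint32_t NewCtlz = Hi == 0 ? ctlz32(Lo) + 32 : ctlz32(Hi);
    return NewCtlz;  // zero-extended back to i64
  }

When ZeroUndef is true and Src == 0 the assert fires, mirroring the undefined result of CTLZ_ZERO_UNDEF; correspondingly, the DAG code only emits the select against the constant 64 in the !ZeroUndef case.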