Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -111,6 +111,16 @@
   /// \returns True.
   bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
 
+  /// \brief Promotes 'ctlz' intrinsic \p I to 32 bit 'ctlz' intrinsic.
+  ///
+  /// \details \p I's base element bit width must be greater than 1 and less
+  /// than or equal to 16. Promotion is done by zero extending the first
+  /// operand to 32 bits, replacing \p I with the 32 bit 'ctlz' intrinsic,
+  /// and truncating its result back to \p I's original type.
+  ///
+  /// \returns True.
+  bool promoteCtlzToI32(IntrinsicInst &I) const;
+
 public:
   static char ID;
   AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
@@ -130,6 +140,7 @@
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+  bool visitCtlzIntrinsicInst(IntrinsicInst &I);
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -308,6 +319,28 @@
   return true;
 }
 
+bool AMDGPUCodeGenPrepare::promoteCtlzToI32(IntrinsicInst &I) const {
+  assert(I.getIntrinsicID() == Intrinsic::ctlz &&
+         "I must be ctlz intrinsic");
+  assert(needsPromotionToI32(I.getType()) &&
+         "I does not need promotion to i32");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getType());
+  Function *I32 =
+      Intrinsic::getDeclaration(Mod, Intrinsic::ctlz, { I32Ty });
+  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
+  Value *ExtRes = Builder.CreateCall(I32, { ExtOp, I.getOperand(1) });
+  Value *TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.eraseFromParent();
+
+  return true;
+}
+
 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
@@ -426,6 +459,8 @@
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
     return visitBitreverseIntrinsicInst(I);
+  case Intrinsic::ctlz:
+    return visitCtlzIntrinsicInst(I);
   default:
     return false;
   }
@@ -441,6 +476,15 @@
   return Changed;
 }
 
+bool AMDGPUCodeGenPrepare::visitCtlzIntrinsicInst(IntrinsicInst &I) {
+  bool Changed = false;
+
+  if (needsPromotionToI32(I.getType()))
+    Changed |= promoteCtlzToI32(I);
+
+  return Changed;
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
   return false;
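For reference, the rewrite promoteCtlzToI32 performs on a scalar i16 ctlz, written out as IR. This is a sketch: the value names %a and %0-%2 are illustrative, and the i1 zero-is-undef flag is carried over from the original call unchanged:

  ; before promotion
  %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)

  ; after promotion; uses of %ctlz are replaced with the trunc result
  %0 = zext i16 %a to i32
  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
  %2 = trunc i32 %1 to i16

Note that, unlike promoteUniformBitreverseToI32, the promoted result is truncated directly, with no adjustment for the widened bit width; the IR and codegen checks below are updated to match.
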
Index: test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
+++ test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
@@ -1039,6 +1039,27 @@
   ret i16 %brev
 }
 
+declare i16 @llvm.ctlz.i16(i16, i1)
+; GCN-LABEL: @ctlz_i16_false(
+; GCN: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; GCN-NEXT: %[[R_32:[0-9]+]] = call i32 @llvm.ctlz.i32(i32 %[[A_32]], i1 false)
+; GCN-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; GCN-NEXT: ret i16 %[[R_16]]
+define i16 @ctlz_i16_false(i16 %a) {
+  %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+  ret i16 %ctlz
+}
+
+; GCN-LABEL: @ctlz_i16_true(
+; GCN: %[[A_32:[0-9]+]] = zext i16 %a to i32
+; GCN-NEXT: %[[R_32:[0-9]+]] = call i32 @llvm.ctlz.i32(i32 %[[A_32]], i1 true)
+; GCN-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
+; GCN-NEXT: ret i16 %[[R_16]]
+define i16 @ctlz_i16_true(i16 %a) {
+  %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 true)
+  ret i16 %ctlz
+}
+
 ; GCN-LABEL: @add_3xi15(
 ; SI: %r = add <3 x i15> %a, %b
 ; SI-NEXT: ret <3 x i15> %r
@@ -2076,3 +2097,24 @@
   %brev = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> %a)
   ret <3 x i16> %brev
 }
+
+declare <3 x i16> @llvm.ctlz.v3i16(<3 x i16>, i1)
+; GCN-LABEL: @ctlz_v3i16_false(
+; GCN: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
+; GCN-NEXT: %[[R_32:[0-9]+]] = call <3 x i32> @llvm.ctlz.v3i32(<3 x i32> %[[A_32]], i1 false)
+; GCN-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
+; GCN-NEXT: ret <3 x i16> %[[R_16]]
+define <3 x i16> @ctlz_v3i16_false(<3 x i16> %a) {
+  %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %a, i1 false)
+  ret <3 x i16> %ctlz
+}
+
+; GCN-LABEL: @ctlz_v3i16_true(
+; GCN: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
+; GCN-NEXT: %[[R_32:[0-9]+]] = call <3 x i32> @llvm.ctlz.v3i32(<3 x i32> %[[A_32]], i1 true)
+; GCN-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
+; GCN-NEXT: ret <3 x i16> %[[R_16]]
+define <3 x i16> @ctlz_v3i16_true(<3 x i16> %a) {
+  %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %a, i1 true)
+  ret <3 x i16> %ctlz
+}
Index: test/CodeGen/AMDGPU/ctlz.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz.ll
+++ test/CodeGen/AMDGPU/ctlz.ll
@@ -97,12 +97,9 @@
 }
 
 ; FUNC-LABEL: {{^}}v_ctlz_i8:
-; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
-; GCN-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
-; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, [[CTLZ]]
-; GCN-DAG: v_cndmask_b32_e64 [[CORRECTED_FFBH:v[0-9]+]], [[FFBH]], 32, vcc
-; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[CORRECTED_FFBH]]
-; GCN: buffer_store_byte [[RESULT]],
+; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
+; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; GCN: {{buffer|flat}}_store_byte [[FFBH]],
 define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %val = load i8, i8 addrspace(1)* %valptr
   %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
@@ -257,12 +254,10 @@
   ret void
 }
 
-; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; FUNC-LABEL: {{^}}v_ctlz_i7_sel_eq_neg1:
 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
-; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]]
-; GCN: {{buffer|flat}}_store_byte [[TRUNC]],
+; GCN: {{buffer|flat}}_store_byte [[FFBH]],
 define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
Index: test/CodeGen/AMDGPU/ctlz_zero_undef.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -79,10 +79,9 @@
 }
 
 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8:
-; SI: buffer_load_ubyte [[VAL:v[0-9]+]],
+; SI: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
 ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
-; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[FFBH]]
-; SI: buffer_store_byte [[RESULT]],
+; SI: {{buffer|flat}}_store_byte [[FFBH]],
 define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %val = load i8, i8 addrspace(1)* %valptr
   %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
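The RUN lines for these tests sit outside the hunks above. Assuming the pass is still registered under the name used by this file's DEBUG_TYPE, the new IR checks can be reproduced with an invocation along the lines of:

  opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck -check-prefix=GCN %s

with the -check-prefix set taken from the file's existing RUN lines; the ctlz.ll and ctlz_zero_undef.ll changes are exercised by the llc-based RUN lines already present in those files.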