diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def
--- a/clang/include/clang/Basic/BuiltinsAArch64.def
+++ b/clang/include/clang/Basic/BuiltinsAArch64.def
@@ -33,6 +33,8 @@
 // Bit manipulation
 BUILTIN(__builtin_arm_rbit, "UiUi", "nc")
 BUILTIN(__builtin_arm_rbit64, "WUiWUi", "nc")
+BUILTIN(__builtin_arm_cls, "UiZUi", "nc")
+BUILTIN(__builtin_arm_cls64, "UiWUi", "nc")
 
 // HINT
 BUILTIN(__builtin_arm_nop, "v", "")
diff --git a/clang/include/clang/Basic/BuiltinsARM.def b/clang/include/clang/Basic/BuiltinsARM.def
--- a/clang/include/clang/Basic/BuiltinsARM.def
+++ b/clang/include/clang/Basic/BuiltinsARM.def
@@ -115,6 +115,8 @@
 
 // Bit manipulation
 BUILTIN(__builtin_arm_rbit, "UiUi", "nc")
+BUILTIN(__builtin_arm_cls, "UiZUi", "nc")
+BUILTIN(__builtin_arm_cls64, "UiWUi", "nc")
 
 // Store and load exclusive
 BUILTIN(__builtin_arm_ldrexd, "LLUiv*", "")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6055,6 +6055,16 @@
         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
   }
 
+  if (BuiltinID == ARM::BI__builtin_arm_cls) {
+    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls), Arg, "cls");
+  }
+  if (BuiltinID == ARM::BI__builtin_arm_cls64) {
+    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls64), Arg,
+                              "cls");
+  }
+
   if (BuiltinID == ARM::BI__clear_cache) {
     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
     const FunctionDecl *FD = E->getDirectCallee();
@@ -7108,6 +7118,17 @@
         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
   }
 
+  if (BuiltinID == AArch64::BI__builtin_arm_cls) {
+    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls), Arg,
+                              "cls");
+  }
+  if (BuiltinID == AArch64::BI__builtin_arm_cls64) {
+    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls64), Arg,
+                              "cls");
+  }
+
   if (BuiltinID == AArch64::BI__builtin_arm_jcvt) {
     assert((getContext().getTypeSize(E->getType()) == 32) &&
            "__jcvt of unusual size!");
diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -139,6 +139,26 @@
   return __builtin_clzll(__t);
 }
 
+/* CLS */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__cls(uint32_t __t) {
+  return __builtin_arm_cls(__t);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__clsl(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+  return __builtin_arm_cls(__t);
+#else
+  return __builtin_arm_cls64(__t);
+#endif
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__clsll(uint64_t __t) {
+  return __builtin_arm_cls64(__t);
+}
+
 /* REV */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
 __rev(uint32_t __t) {
diff --git a/clang/test/CodeGen/arm_acle.c b/clang/test/CodeGen/arm_acle.c
--- a/clang/test/CodeGen/arm_acle.c
+++ b/clang/test/CodeGen/arm_acle.c
@@ -175,6 +175,27 @@
   return __clzll(t);
 }
 
+// ARM-LABEL: test_cls
+// AArch32: call i32 @llvm.arm.cls(i32 %t)
+// AArch64: call i32 @llvm.aarch64.cls(i32 %t)
+unsigned test_cls(uint32_t t) {
+  return __cls(t);
+}
+
+// ARM-LABEL: test_clsl
+// AArch32: call i32 @llvm.arm.cls(i32 %t)
+// AArch64: call i32 @llvm.aarch64.cls64(i64 %t)
+unsigned test_clsl(unsigned long t) {
+  return __clsl(t);
+}
+
+// ARM-LABEL: test_clsll
+// AArch32: call i32 @llvm.arm.cls64(i64 %t)
+// AArch64: call i32 @llvm.aarch64.cls64(i64 %t)
+unsigned test_clsll(uint64_t t) {
+  return __clsll(t);
+}
+
 // ARM-LABEL: test_rev
 // ARM: call i32 @llvm.bswap.i32(i32 %t)
 uint32_t test_rev(uint32_t t) {
diff --git a/clang/test/CodeGen/builtins-arm.c b/clang/test/CodeGen/builtins-arm.c
--- a/clang/test/CodeGen/builtins-arm.c
+++ b/clang/test/CodeGen/builtins-arm.c
@@ -256,6 +256,21 @@
   __builtin_arm_wsrp("sysreg", v);
 }
 
+unsigned int cls(uint32_t v) {
+  // CHECK: call i32 @llvm.arm.cls(i32 %v)
+  return __builtin_arm_cls(v);
+}
+
+unsigned int clsl(unsigned long v) {
+  // CHECK: call i32 @llvm.arm.cls(i32 %v)
+  return __builtin_arm_cls(v);
+}
+
+unsigned int clsll(uint64_t v) {
+  // CHECK: call i32 @llvm.arm.cls64(i64 %v)
+  return __builtin_arm_cls64(v);
+}
+
 // CHECK: ![[M0]] = !{!"cp1:2:c3:c4:5"}
 // CHECK: ![[M1]] = !{!"cp1:2:c3"}
 // CHECK: ![[M2]] = !{!"sysreg"}
diff --git a/clang/test/CodeGen/builtins-arm64.c b/clang/test/CodeGen/builtins-arm64.c
--- a/clang/test/CodeGen/builtins-arm64.c
+++ b/clang/test/CodeGen/builtins-arm64.c
@@ -106,4 +106,21 @@
   __builtin_arm_wsrp("1:2:3:4:5", v);
 }
 
+unsigned int cls(uint32_t v) {
+  // CHECK: call i32 @llvm.aarch64.cls(i32 %v)
+  return __builtin_arm_cls(v);
+}
+
+unsigned int clsl(unsigned long v) {
+  // CHECK-WIN: [[V64:%[^ ]+]] = zext i32 %v to i64
+  // CHECK-WIN: call i32 @llvm.aarch64.cls64(i64 [[V64]])
+  // CHECK-LINUX: call i32 @llvm.aarch64.cls64(i64 %v)
+  return __builtin_arm_cls64(v);
+}
+
+unsigned int clsll(uint64_t v) {
+  // CHECK: call i32 @llvm.aarch64.cls64(i64 %v)
+  return __builtin_arm_cls64(v);
+}
+
 // CHECK: ![[M0]] = !{!"1:2:3:4:5"}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -33,6 +33,9 @@
 def int_aarch64_fjcvtzs : Intrinsic<[llvm_i32_ty], [llvm_double_ty],
                                     [IntrNoMem]>;
 
+def int_aarch64_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_aarch64_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
+
 //===----------------------------------------------------------------------===//
 // HINT
 
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -843,4 +843,7 @@
 def int_arm_mve_vst4q: Intrinsic<
   [], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty], [IntrWriteMem]
 >;
+def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
+
 } // end TargetPrefix
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1527,6 +1527,8 @@
 def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)),
                 (i64 1))),
           (CLSXr GPR64:$Rn)>;
+def : Pat<(int_aarch64_cls GPR32:$Rn), (CLSWr GPR32:$Rn)>;
+def : Pat<(int_aarch64_cls64 GPR64:$Rm), (EXTRACT_SUBREG (CLSXr GPR64:$Rm), sub_32)>;
 
 // Unlike the other one operand instructions, the instructions with the "rev"
 // mnemonic do *not* just differ in the size bit, but actually use different
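[Editorial note, not part of the patch: CLS ("count leading sign bits") returns the number of bits below the sign bit that are equal to it, so a 32-bit cls of 0 or ~0 is 31. The sketch below is a hypothetical reference model (the helper names ref_cls32 and cls_via_clz are not from the patch); cls_via_clz demonstrates the identity cls(x) == clz(((x ^ (x >> 31)) << 1) | 1) that the AArch64 ctlz pattern above and the 32-bit ARM lowering below both rely on. The `| 1` keeps the clz argument nonzero, so __builtin_clz stays defined.]

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical reference model of 32-bit CLS: count the bits below the
       sign bit (bit 31) that equal it, stopping at the first mismatch. */
    static unsigned ref_cls32(uint32_t x) {
      uint32_t sign = x >> 31;
      unsigned n = 0;
      for (int i = 30; i >= 0 && ((x >> i) & 1) == sign; --i)
        ++n;
      return n;
    }

    /* The lowering's identity: xoring with the sign-extended sign bit turns
       leading sign bits into leading zeros; `<< 1 | 1` drops the sign bit
       itself from the count and keeps the clz argument nonzero. */
    static unsigned cls_via_clz(uint32_t x) {
      uint32_t t = ((x ^ (uint32_t)((int32_t)x >> 31)) << 1) | 1u;
      return (unsigned)__builtin_clz(t);
    }

    int main(void) {
      assert(ref_cls32(0x00000000u) == 31); /* 31 zeros below the sign bit */
      assert(ref_cls32(0xFFFFFFFFu) == 31); /* 31 ones below the sign bit */
      assert(ref_cls32(0x00FFFFFFu) == 7);  /* bits 30..24 match sign bit 0 */
      assert(ref_cls32(0x80000000u) == 0);  /* bit 30 already differs */
      assert(cls_via_clz(0x00000000u) == ref_cls32(0x00000000u));
      assert(cls_via_clz(0x00FFFFFFu) == ref_cls32(0x00FFFFFFu));
      assert(cls_via_clz(0x80000000u) == ref_cls32(0x80000000u));
      return 0;
    }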
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3629,6 +3629,49 @@
     EVT PtrVT = getPointerTy(DAG.getDataLayout());
     return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
   }
+  case Intrinsic::arm_cls: {
+    const SDValue &Operand = Op.getOperand(1);
+    const EVT VTy = Op.getValueType();
+    SDValue SRA =
+        DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
+    SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
+    SDValue SHL =
+        DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
+    SDValue OR =
+        DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
+    SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
+    return Result;
+  }
+  case Intrinsic::arm_cls64: {
+    // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
+    //          else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
+    const SDValue &Operand = Op.getOperand(1);
+    const EVT VTy = Op.getValueType();
+
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
+                             DAG.getConstant(1, dl, VTy));
+    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
+                             DAG.getConstant(0, dl, VTy));
+    SDValue Constant0 = DAG.getConstant(0, dl, VTy);
+    SDValue Constant1 = DAG.getConstant(1, dl, VTy);
+    SDValue Constant31 = DAG.getConstant(31, dl, VTy);
+    SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
+    SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
+    SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
+    SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
+    SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
+    SDValue CheckLo =
+        DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
+    SDValue HiIsZero =
+        DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
+    SDValue AdjustedLo =
+        DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
+    SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
+    SDValue Result =
+        DAG.getSelect(dl, VTy, CheckLo,
+                      DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31),
+                      CLSHi);
+    return Result;
+  }
   case Intrinsic::eh_sjlj_lsda: {
     MachineFunction &MF = DAG.getMachineFunction();
     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
diff --git a/llvm/test/CodeGen/AArch64/cls.ll b/llvm/test/CodeGen/AArch64/cls.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cls.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
+
+; @llvm.aarch64.cls must be directly translated into the 'cls' instruction
+
+; CHECK-LABEL: cls
+; CHECK: cls [[REG:w[0-9]+]], [[REG]]
+define i32 @cls(i32 %t) {
+  %cls.i = call i32 @llvm.aarch64.cls(i32 %t)
+  ret i32 %cls.i
+}
+
+; CHECK-LABEL: cls64
+; CHECK: cls [[REG:x[0-9]+]], [[REG]]
+define i32 @cls64(i64 %t) {
+  %cls.i = call i32 @llvm.aarch64.cls64(i64 %t)
+  ret i32 %cls.i
+}
+
+declare i32 @llvm.aarch64.cls(i32) nounwind
+declare i32 @llvm.aarch64.cls64(i64) nounwind
diff --git a/llvm/test/CodeGen/ARM/cls.ll b/llvm/test/CodeGen/ARM/cls.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/cls.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=armv5 %s -o - | FileCheck %s
+
+; CHECK: eor [[T:r[0-9]+]], [[T]], [[T]], asr #31
+; CHECK-NEXT: mov [[C1:r[0-9]+]], #1
+; CHECK-NEXT: orr [[T]], [[C1]], [[T]], lsl #1
+; CHECK-NEXT: clz [[T]], [[T]]
+define i32 @cls(i32 %t) {
+  %cls.i = call i32 @llvm.arm.cls(i32 %t)
+  ret i32 %cls.i
+}
+
+; CHECK: cmp r1, #0
+; CHECK: mvnne [[ADJUSTEDLO:r[0-9]+]], r0
+; CHECK: clz [[CLZLO:r[0-9]+]], [[ADJUSTEDLO]]
+; CHECK: eor [[A:r[0-9]+]], r1, r1, asr #31
+; CHECK: mov r1, #1
+; CHECK: orr [[A]], r1, [[A]], lsl #1
+; CHECK: clz [[CLSHI:r[0-9]+]], [[A]]
+; CHECK: cmp [[CLSHI]], #31
+; CHECK: addeq r0, [[CLZLO]], #31
+define i32 @cls64(i64 %t) {
+  %cls.i = call i32 @llvm.arm.cls64(i64 %t)
+  ret i32 %cls.i
+}
+
+declare i32 @llvm.arm.cls(i32) nounwind
+declare i32 @llvm.arm.cls64(i64) nounwind
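[Editorial note, not part of the patch: the sketch below cross-checks the split-word identity that the Intrinsic::arm_cls64 lowering above implements. When cls of the high word saturates at 31, the high word is all zeros or all ones, so the count continues into the low word, complemented when the sign is one. The helper names clz32, cls64_ref and cls64_split are hypothetical; C's __builtin_clz(0) is undefined, so the zero case is guarded to return 32, matching what the CLZ instruction behind ISD::CTLZ produces.]

    #include <assert.h>
    #include <stdint.h>

    /* clz that, like ARM's CLZ instruction, yields 32 for a zero input. */
    static unsigned clz32(uint32_t x) {
      return x ? (unsigned)__builtin_clz(x) : 32;
    }

    /* Direct definition of 64-bit CLS: bits below bit 63 equal to bit 63. */
    static unsigned cls64_ref(uint64_t x) {
      uint64_t sign = x >> 63;
      unsigned n = 0;
      for (int i = 62; i >= 0 && ((x >> i) & 1) == sign; --i)
        ++n;
      return n;
    }

    /* The decomposition used by the lowering:
       cls(x) = cls(hi)                       if cls(hi) != 31
              = 31 + clz(hi == 0 ? lo : ~lo)  otherwise */
    static unsigned cls64_split(uint64_t x) {
      uint32_t hi = (uint32_t)(x >> 32), lo = (uint32_t)x;
      unsigned clshi = clz32(((hi ^ (uint32_t)((int32_t)hi >> 31)) << 1) | 1u);
      if (clshi != 31)
        return clshi;
      return 31 + clz32(hi == 0 ? lo : ~lo);
    }

    int main(void) {
      const uint64_t tests[] = {0, 1, ~0ull, 0x0000000080000000ull,
                                0x00FFFFFF00000000ull, 0xFFFFFFFF00000000ull};
      for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; ++i)
        assert(cls64_split(tests[i]) == cls64_ref(tests[i]));
      return 0;
    }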