Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17651,6 +17651,45 @@
   return SDValue();
 }
 
+static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
+  unsigned CC = N->getConstantOperandVal(2);
+  SDValue SUBS = N->getOperand(3);
+  SDValue Zero, CTTZ;
+
+  if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
+    Zero = N->getOperand(0);
+    CTTZ = N->getOperand(1);
+  } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
+    Zero = N->getOperand(1);
+    CTTZ = N->getOperand(0);
+  } else
+    return SDValue();
+
+  if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
+      (CTTZ.getOpcode() == ISD::TRUNCATE &&
+       CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
+    return SDValue();
+
+  assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
+         "Illegal type in CTTZ folding");
+
+  if (!isNullConstant(Zero) || !isNullConstant(SUBS.getValue(1).getOperand(1)))
+    return SDValue();
+
+  SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
+                  ? CTTZ.getOperand(0).getOperand(0)
+                  : CTTZ.getOperand(0);
+
+  if (X != SUBS.getOperand(0))
+    return SDValue();
+
+  unsigned BitWidth = CTTZ.getValueSizeInBits();
+  SDValue BitWidthMinusOne =
+      DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
+  return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
+                     BitWidthMinusOne);
+}
+
 // Optimize CSEL instructions
 static SDValue performCSELCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
@@ -17659,6 +17698,11 @@
   if (N->getOperand(0) == N->getOperand(1))
     return N->getOperand(0);
 
+  // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
+  // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
+  if (SDValue Folded = foldCSELofCTTZ(N, DAG))
+    return Folded;
+
   return performCONDCombine(N, DCI, DAG, 2, 3);
 }
 
Index: llvm/test/CodeGen/AArch64/table-based-cttz.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/table-based-cttz.ll
@@ -0,0 +1,139 @@
+; RUN: llc -march=aarch64 < %s | FileCheck %s
+
+;; Check the transformation
+;; CSEL 0, cttz, cc -> AND cttz numbits-1
+;; for cttz in the case of i32 and i64 respectively
+
+;; Cases for which the optimization takes place
+define i32 @cttzi32(i32 %x) {
+; CHECK: rbit w8, w0
+; CHECK-NEXT: clz w8, w8
+; CHECK-NEXT: and w0, w8, #0x1f
+; CHECK-NEXT: ret
+entry:
+  %0 = call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %1 = icmp eq i32 %x, 0
+  %2 = select i1 %1, i32 0, i32 %0
+  ret i32 %2
+}
+
+define i64 @cttzi64(i64 %x) {
+; CHECK: rbit x8, x0
+; CHECK-NEXT: clz x8, x8
+; CHECK-NEXT: and x0, x8, #0x3f
+; CHECK-NEXT: ret
+entry:
+  %0 = call i64 @llvm.cttz.i64(i64 %x, i1 true)
+  %1 = icmp eq i64 %x, 0
+  %2 = select i1 %1, i64 0, i64 %0
+  ret i64 %2
+}
+
+define i32 @cttzi32ne(i32 %x) {
+; CHECK: rbit w8, w0
+; CHECK-NEXT: clz w8, w8
+; CHECK-NEXT: and w0, w8, #0x1f
+; CHECK-NEXT: ret
+entry:
+  %0 = call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %1 = icmp ne i32 %x, 0
+  %2 = select i1 %1, i32 %0, i32 0
+  ret i32 %2
+}
+
+define i64 @cttzi64ne(i64 %x) {
+; CHECK: rbit x8, x0
+; CHECK-NEXT: clz x8, x8
+; CHECK-NEXT: and x0, x8, #0x3f
+; CHECK-NEXT: ret
+entry:
+  %0 = call i64 @llvm.cttz.i64(i64 %x, i1 true)
+  %1 = icmp ne i64 %x, 0
+  %2 = select i1 %1, i64 %0, i64 0
+  ret i64 %2
+}
+
+define i32 @cttztrunc(i64 %x) {
+; CHECK: rbit x8, x0
+; CHECK-NEXT: clz x8, x8
+; CHECK-NEXT: and w0, w8, #0x1f
+; CHECK-NEXT: ret
+entry:
+  %0 = call i64 @llvm.cttz.i64(i64 %x, i1 true)
+  %1 = icmp eq i64 %x, 0
+  %2 = select i1 %1, i64 0, i64 %0
+  %3 = trunc i64 %2 to i32
+  ret i32 %3
+}
+
+;; Cases for which the optimization does not take place
+define i32 @cttzne(i32 %x) {
+; CHECK: rbit w8, w0
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: clz w8, w8
+; CHECK-NEXT: csel w0, wzr, w8, ne
+; CHECK-NEXT: ret
+entry:
+  %0 = call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %1 = icmp ne i32 %x, 0
+  %2 = select i1 %1, i32 0, i32 %0
+  ret i32 %2
+}
+
+define i32 @cttzxnot0(i32 %x) {
+; CHECK: rbit w8, w0
+; CHECK-NEXT: cmp w0, #10
+; CHECK-NEXT: clz w8, w8
+; CHECK-NEXT: csel w0, wzr, w8, eq
+; CHECK-NEXT: ret
+entry:
+  %0 = call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %1 = icmp eq i32 %x, 10
+  %2 = select i1 %1, i32 0, i32 %0
+  ret i32 %2
+}
+
+define i32 @cttzlhsnot0(i32 %x) {
+; CHECK: rbit w9, w0
+; CHECK-NEXT: mov w8, #10
+; CHECK-NEXT: clz w9, w9
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: csel w0, w8, w9, eq
+; CHECK-NEXT: ret
+entry:
+  %0 = call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %1 = icmp eq i32 %x, 0
+  %2 = select i1 %1, i32 10, i32 %0
+  ret i32 %2
+}
+
+define i32 @notcttz(i32 %x) {
+; CHECK: clz w8, w0
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: csel w0, wzr, w8, eq
+; CHECK-NEXT: ret
+entry:
+  %0 = call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+  %1 = icmp eq i32 %x, 0
+  %2 = select i1 %1, i32 0, i32 %0
+  ret i32 %2
+}
+
+define i32 @cttzlhsnotx(i32 %x, i32 %y) {
+; CHECK: rbit w8, w0
+; CHECK-NEXT: cmp w1, #0
+; CHECK-NEXT: clz w8, w8
+; CHECK-NEXT: csel w0, wzr, w8, eq
+; CHECK-NEXT: ret
+entry:
+  %0 = call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %1 = icmp eq i32 %y, 0
+  %2 = select i1 %1, i32 0, i32 %0
+  ret i32 %2
+}
+
+declare i32 @llvm.cttz.i32(i32, i1)
+
+declare i64 @llvm.cttz.i64(i64, i1)
+
+declare i32 @llvm.ctlz.i32(i32, i1)
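
A minimal source-level reproducer of the pattern the new combine targets, for
reference (an illustrative sketch, not part of the patch; the function name
safe_ctz is made up here). Compiled with clang -O2 --target=aarch64, the
guarded __builtin_ctz below typically reaches the backend as
CSEL(0, cttz(x), eq(x, 0)), which the combine rewrites to AND(cttz(x), 31).
The mask also gives the right answer for x == 0 because cttz is lowered to
CLZ(RBIT(x)), CLZ of zero is the register width, and 32 & 31 == 0
(likewise 64 & 63 == 0 in the 64-bit case).

  /* Illustrative C, assuming a GCC/Clang-style __builtin_ctz. */
  #include <stdio.h>

  static unsigned safe_ctz(unsigned x) {
    /* __builtin_ctz is undefined for x == 0, so the source guards it;
       the guard is what becomes the CSEL against zero on AArch64. */
    return x ? (unsigned)__builtin_ctz(x) : 0u;
  }

  int main(void) {
    printf("%u %u %u\n", safe_ctz(0u), safe_ctz(1u), safe_ctz(40u)); /* 0 0 3 */
    return 0;
  }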