diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -310,6 +310,7 @@
   findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, unsigned Opc_ri,
                            const SDValue &OldBase, const SDValue &OldOffset,
                            unsigned Scale);
+  void optimizeShiftAccumulate(SDNode *N);
 
   bool tryBitfieldExtractOp(SDNode *N);
   bool tryBitfieldExtractOpFromSExt(SDNode *N);
@@ -2730,6 +2731,31 @@
   return false;
 }
 
+// Turn an OR into an ADD if it is adding 2 operands with no common bits
+// and one of the operands is a VLSHR.
+//
+// We can select this into a USRA instruction.
+void AArch64DAGToDAGISel::optimizeShiftAccumulate(SDNode *N) {
+  if (N->getOpcode() != ISD::OR)
+    return;
+
+  // Only turn an OR into an ADD if the operands have no common bits set.
+  if (!CurDAG->haveNoCommonBitsSet(N->getOperand(0), N->getOperand(1)))
+    return;
+
+  for (const SDValue &Op : N->op_values()) {
+    if (Op->getOpcode() == AArch64ISD::VLSHR) {
+      SmallVector<SDValue, 2> Ops;
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+        Ops.push_back(N->getOperand(i));
+
+      CurDAG->MorphNodeTo(N, ISD::ADD, CurDAG->getVTList(N->getValueType(0)),
+                          Ops);
+      return;
+    }
+  }
+}
+
 bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
   if (N->getOpcode() != ISD::OR)
     return false;
@@ -3469,6 +3495,9 @@
   case ISD::OR:
     if (tryBitfieldInsertOp(Node))
       return;
+
+    // See whether we can turn this into a shift-accumulate.
+    optimizeShiftAccumulate(Node);
     break;
 
   case ISD::EXTRACT_SUBVECTOR: {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1796,6 +1796,13 @@
     Known = KnownBits::commonBits(Known, Known2);
     break;
   }
+  case AArch64ISD::VLSHR: {
+    KnownBits Known2;
+    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
+    Known = KnownBits::lshr(Known, Known2);
+    break;
+  }
   case AArch64ISD::LOADgot:
   case AArch64ISD::ADDlow: {
     if (!Subtarget->isTargetILP32())
diff --git a/llvm/test/CodeGen/AArch64/shift-accumulate.ll b/llvm/test/CodeGen/AArch64/shift-accumulate.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/shift-accumulate.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+define dso_local i32 @usra(<16 x i8> %0) local_unnamed_addr #0 align 32 {
+; CHECK-LABEL: usra:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v0.16b, v0.16b, #7
+; CHECK-NEXT:    usra v0.8h, v0.8h, #7
+  %2 = lshr <16 x i8> %0, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %3 = bitcast <16 x i8> %2 to <8 x i16>
+  %4 = lshr <8 x i16> %3, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  %5 = or <8 x i16> %4, %3
+  %6 = bitcast <8 x i16> %5 to <16 x i8>
+  %7 = extractelement <16 x i8> %6, i32 0
+  %8 = zext i8 %7 to i32
+  ret i32 %8
+}