diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1796,6 +1796,34 @@
     Known = KnownBits::commonBits(Known, Known2);
     break;
   }
+  case AArch64ISD::BICi: {
+    KnownBits Known2, Known3;
+    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
+    Known3 = DAG.computeKnownBits(Op->getOperand(2), Depth + 1);
+    if (!Known2.isConstant() || !Known3.isConstant())
+      break;
+
+    // Compute the bit cleared value.
+    uint64_t Mask =
+        ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
+    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+    Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
+    break;
+  }
+  case AArch64ISD::VLSHR: {
+    KnownBits Known2;
+    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
+    Known = KnownBits::lshr(Known, Known2);
+    break;
+  }
+  case AArch64ISD::VASHR: {
+    KnownBits Known2;
+    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
+    Known = KnownBits::ashr(Known, Known2);
+    break;
+  }
   case AArch64ISD::LOADgot:
   case AArch64ISD::ADDlow: {
     if (!Subtarget->isTargetILP32())
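To make the new BICi known-bits case concrete: for a BIC immediate of 0x40 shifted left by 8 (the "bic v0.4h, #64, lsl #8" that shows up in the tests below), each 16-bit lane is known to be AND'ed with ~(0x40 << 8) = 0xbfff, i.e. only bit 14 is forced to zero. A minimal standalone C++ sketch of that arithmetic, not part of the patch:

#include <cassert>
#include <cstdint>

int main() {
  // Same computation as the BICi case above, on concrete operands:
  // BIC immediate 0x40, left shift 8 (cf. "bic v0.4h, #64, lsl #8").
  uint64_t Imm = 0x40, Shift = 8;
  uint16_t LaneMask = static_cast<uint16_t>(~(Imm << Shift));
  assert(LaneMask == 0xbfff);            // only bit 14 is known to be cleared
  assert((0xffff & LaneMask) == 0xbfff); // applying the mask clears just bit 14
  return 0;
}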
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6342,6 +6342,22 @@
                            VectorIndexS:$idx)),
           (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
 
+// Match an 'add' node, and also treat an 'or' node as an 'add' if the or'ed
+// operands have no common bits.
+def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs),
+         [(add node:$lhs, node:$rhs), (or node:$lhs, node:$rhs)],[{
+   if (N->getOpcode() == ISD::ADD)
+     return true;
+   return CurDAG->haveNoCommonBitsSet(N->getOperand(0), N->getOperand(1));
+}]> {
+  let GISelPredicateCode = [{
+     // Only handle G_ADD for now. FIXME: build capability to compute whether
+     // operands of G_OR have common bits set or not.
+     return MI.getOpcode() == TargetOpcode::G_ADD;
+  }];
+}
+
+
 //----------------------------------------------------------------------------
 // AdvSIMD scalar shift instructions
 //----------------------------------------------------------------------------
@@ -6447,7 +6463,7 @@
                     (AArch64srshri node:$MHS, node:$RHS))>>;
 defm SSHR    : SIMDScalarRShiftD<   0, 0b00000, "sshr", AArch64vashr>;
 defm SSRA    : SIMDScalarRShiftDTied<  0, 0b00010, "ssra",
-    TriOpFrag<(add node:$LHS,
+    TriOpFrag<(add_and_or_is_add node:$LHS,
                    (AArch64vashr node:$MHS, node:$RHS))>>;
 defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn",
                                      int_aarch64_neon_uqrshrn>;
@@ -6460,7 +6476,7 @@
                     (AArch64urshri node:$MHS, node:$RHS))>>;
 defm USHR    : SIMDScalarRShiftD<   1, 0b00000, "ushr", AArch64vlshr>;
 defm USRA    : SIMDScalarRShiftDTied<  1, 0b00010, "usra",
-    TriOpFrag<(add node:$LHS,
+    TriOpFrag<(add_and_or_is_add node:$LHS,
                    (AArch64vlshr node:$MHS, node:$RHS))>>;
 
 //----------------------------------------------------------------------------
@@ -6502,7 +6518,7 @@
 defm SSHR    : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>;
 defm SSRA    : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
-                TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
+                TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
 defm UCVTF   : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf",
                                     int_aarch64_neon_vcvtfxu2fp>;
 defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
@@ -6518,7 +6534,7 @@
                 BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>;
 defm USHR    : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
 defm USRA    : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
-                TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
+                TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
 
 // SHRN patterns for when a logical right shift was used instead of arithmetic
 // (the immediate guarantees no sign bits actually end up in the result so it
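The add_and_or_is_add fragment above leans on the identity that an 'or' of two values with no common set bits behaves exactly like an 'add' (no carries can occur), which is what allows the or-of-shifts in the tests below to be selected as the accumulating usra/ssra instructions. A minimal standalone C++ illustration, not part of the patch:

#include <cassert>
#include <cstdint>

int main() {
  // Disjoint bit patterns: upper bits in one value, only bit 0 in the other.
  uint16_t Hi = 0xfffe;
  uint16_t Lo = 0x0001;
  assert((Hi & Lo) == 0);                               // no common bits set
  assert((Hi | Lo) == static_cast<uint16_t>(Hi + Lo));  // so 'or' equals 'add'
  return 0;
}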
diff --git a/llvm/test/CodeGen/AArch64/shift-accumulate.ll b/llvm/test/CodeGen/AArch64/shift-accumulate.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/shift-accumulate.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+define dso_local <4 x i16> @usra_v4i16(<8 x i8> %0) local_unnamed_addr #0 align 32 {
+; CHECK-LABEL: usra_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v0.8b, v0.8b, #7
+; CHECK-NEXT:    usra v0.4h, v0.4h, #7
+; CHECK-NEXT:    ret
+  %2 = lshr <8 x i8> %0, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %3 = bitcast <8 x i8> %2 to <4 x i16>
+  %4 = lshr <4 x i16> %3, <i16 7, i16 7, i16 7, i16 7>
+  %5 = or <4 x i16> %4, %3
+  ret <4 x i16> %5
+}
+
+define dso_local <4 x i32> @usra_v4i32(<8 x i16> %0) local_unnamed_addr #0 align 32 {
+; CHECK-LABEL: usra_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #15
+; CHECK-NEXT:    usra v0.4s, v0.4s, #15
+; CHECK-NEXT:    ret
+  %2 = lshr <8 x i16> %0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %3 = bitcast <8 x i16> %2 to <4 x i32>
+  %4 = lshr <4 x i32> %3, <i32 15, i32 15, i32 15, i32 15>
+  %5 = or <4 x i32> %4, %3
+  ret <4 x i32> %5
+}
+
+define dso_local <4 x i16> @ssra_v4i16(<4 x i16> %0) local_unnamed_addr #0 align 32 {
+; CHECK-LABEL: ssra_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v1.4h, v0.4h, #15
+; CHECK-NEXT:    bic v0.4h, #64, lsl #8
+; CHECK-NEXT:    ssra v1.4h, v0.4h, #14
+; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ret
+  ; set the 15th bit to zero, e.g. 0b1111111111111111 to 0b1011111111111111
+  %2 = and <4 x i16> %0, <i16 49151, i16 49151, i16 49151, i16 49151>
+  ; the high 15 bits are zero, the low bit can be zero or one, e.g. 0b1011111111111111 to 0b0000000000000001
+  %3 = lshr <4 x i16> %0, <i16 15, i16 15, i16 15, i16 15>
+  ; the high 15 bits may be 1, and the low bit is zero, e.g. 0b1011111111111111 to 0b1111111111111110
+  %4 = ashr <4 x i16> %2, <i16 14, i16 14, i16 14, i16 14>
+  %5 = or <4 x i16> %3, %4
+  ret <4 x i16> %5
+}
+
+define dso_local <4 x i32> @ssra_v4i32(<4 x i32> %0) local_unnamed_addr #0 align 32 {
+; CHECK-LABEL: ssra_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v1.4s, v0.4s, #31
+; CHECK-NEXT:    bic v0.4s, #64, lsl #24
+; CHECK-NEXT:    ssra v1.4s, v0.4s, #30
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  ; set the 31st bit to zero.
+  %2 = and <4 x i32> %0, <i32 3221225471, i32 3221225471, i32 3221225471, i32 3221225471>
+  ; the high 31 bits are zero, the low bit can be zero or one.
+  %3 = lshr <4 x i32> %0, <i32 31, i32 31, i32 31, i32 31>
+  ; the high 31 bits may be 1, and the low bit is zero.
+  %4 = ashr <4 x i32> %2, <i32 30, i32 30, i32 30, i32 30>
+  %5 = or <4 x i32> %3, %4
+  ret <4 x i32> %5
+}
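The ssra tests rely on the new known-bits cases to prove that the two operands of the final 'or' are disjoint: the lshr-by-15 term can only have bit 0 set, while the ashr-by-14 of the BIC'ed value always has bit 0 clear. A standalone C++ check of that reasoning for the 16-bit case, not part of the patch and using plain scalar shifts in place of the DAG nodes:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t V = 0; V <= 0xffff; ++V) {
    uint16_t X = static_cast<uint16_t>(V);
    uint16_t Lsr = X >> 15;                              // only bit 0 can be set
    int16_t Masked = static_cast<int16_t>(X & 0xbfff);   // bit 14 cleared (the BIC)
    uint16_t Asr = static_cast<uint16_t>(Masked >> 14);  // arithmetic shift right
    assert((Asr & 1) == 0);   // low bit of the ashr result is always zero
    assert((Lsr & Asr) == 0); // 'or' operands share no bits, so ssra applies
  }
  return 0;
}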