diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1832,6 +1832,28 @@
     Known = KnownBits::commonBits(Known, Known2);
     break;
   }
+  case AArch64ISD::BICi: {
+    // Compute the bit cleared value.
+    uint64_t Mask =
+        ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
+    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+    Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
+    break;
+  }
+  case AArch64ISD::VLSHR: {
+    KnownBits Known2;
+    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
+    Known = KnownBits::lshr(Known, Known2);
+    break;
+  }
+  case AArch64ISD::VASHR: {
+    KnownBits Known2;
+    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
+    Known = KnownBits::ashr(Known, Known2);
+    break;
+  }
   case AArch64ISD::LOADgot:
   case AArch64ISD::ADDlow: {
     if (!Subtarget->isTargetILP32())
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6423,6 +6423,22 @@
                                             VectorIndexS:$idx)),
           (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
 
+// Match an 'add' node, and also treat an 'or' node as an 'add' if the or'ed
+// operands have no common bits.
+def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs),
+    [(add node:$lhs, node:$rhs), (or node:$lhs, node:$rhs)],[{
+  if (N->getOpcode() == ISD::ADD)
+    return true;
+  return CurDAG->haveNoCommonBitsSet(N->getOperand(0), N->getOperand(1));
+}]> {
+  let GISelPredicateCode = [{
+     // Only handle G_ADD for now. FIXME: Build the capability to compute
+     // whether the operands of G_OR have common bits set or not.
+     return MI.getOpcode() == TargetOpcode::G_ADD;
+  }];
+}
+
+
 //----------------------------------------------------------------------------
 // AdvSIMD scalar shift instructions
 //----------------------------------------------------------------------------
@@ -6528,7 +6544,7 @@
                                      (AArch64srshri node:$MHS, node:$RHS))>>;
 defm SSHR     : SIMDScalarRShiftD<   0, 0b00000, "sshr", AArch64vashr>;
 defm SSRA     : SIMDScalarRShiftDTied<    0, 0b00010, "ssra",
-    TriOpFrag<(add node:$LHS,
+    TriOpFrag<(add_and_or_is_add node:$LHS,
                    (AArch64vashr node:$MHS, node:$RHS))>>;
 defm UQRSHRN  : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn",
                                      int_aarch64_neon_uqrshrn>;
@@ -6541,7 +6557,7 @@
                                      (AArch64urshri node:$MHS, node:$RHS))>>;
 defm USHR     : SIMDScalarRShiftD<   1, 0b00000, "ushr", AArch64vlshr>;
 defm USRA     : SIMDScalarRShiftDTied<    1, 0b00010, "usra",
-    TriOpFrag<(add node:$LHS,
+    TriOpFrag<(add_and_or_is_add node:$LHS,
                    (AArch64vlshr node:$MHS, node:$RHS))>>;
 
 //----------------------------------------------------------------------------
@@ -6583,7 +6599,7 @@
 
 defm SSHR     : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>;
 defm SSRA     : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
-               TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
+               TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
 defm UCVTF    : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf",
                                      int_aarch64_neon_vcvtfxu2fp>;
 defm UQRSHRN  : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
@@ -6599,7 +6615,7 @@
                 BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>;
 defm USHR     : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
 defm USRA     : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
-               TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
+               TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
 
 // RADDHN patterns for when RSHRN shifts by half the size of the vector element
 def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))),
diff --git a/llvm/test/CodeGen/AArch64/shift-accumulate.ll b/llvm/test/CodeGen/AArch64/shift-accumulate.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/shift-accumulate.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
+define <4 x i16> @usra_v4i16(<8 x i8> %0) {
+; CHECK-LABEL: usra_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v0.8b, v0.8b, #7
+; CHECK-NEXT:    usra v0.4h, v0.4h, #7
+; CHECK-NEXT:    ret
+  %2 = lshr <8 x i8> %0, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %3 = bitcast <8 x i8> %2 to <4 x i16>
+  %4 = lshr <4 x i16> %3, <i16 7, i16 7, i16 7, i16 7>
+  %5 = or <4 x i16> %4, %3
+  ret <4 x i16> %5
+}
+
+define <4 x i32> @usra_v4i32(<8 x i16> %0) {
+; CHECK-LABEL: usra_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #15
+; CHECK-NEXT:    usra v0.4s, v0.4s, #15
+; CHECK-NEXT:    ret
+  %2 = lshr <8 x i16> %0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %3 = bitcast <8 x i16> %2 to <4 x i32>
+  %4 = lshr <4 x i32> %3, <i32 15, i32 15, i32 15, i32 15>
+  %5 = or <4 x i32> %4, %3
+  ret <4 x i32> %5
+}
+
+define <2 x i64> @usra_v2i64(<4 x i32> %0) {
+; CHECK-LABEL: usra_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #31
+; CHECK-NEXT:    usra v0.2d, v0.2d, #31
+; CHECK-NEXT:    ret
+  %2 = lshr <4 x i32> %0, <i32 31, i32 31, i32 31, i32 31>
+  %3 = bitcast <4 x i32> %2 to <2 x i64>
+  %4 = lshr <2 x i64> %3, <i64 31, i64 31>
+  %5 = or <2 x i64> %4, %3
+  ret <2 x i64> %5
+}
+
+define <1 x i64> @usra_v1i64(<2 x i32> %0) {
+; CHECK-LABEL: usra_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v0.2s, v0.2s, #31
+; CHECK-NEXT:    usra d0, d0, #31
+; CHECK-NEXT:    ret
+  %2 = lshr <2 x i32> %0, <i32 31, i32 31>
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = lshr <1 x i64> %3, <i64 31>
+  %5 = or <1 x i64> %4, %3
+  ret <1 x i64> %5
+}
+
+define <4 x i16> @ssra_v4i16(<4 x i16> %0) {
+; CHECK-LABEL: ssra_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v1.4h, v0.4h, #15
+; CHECK-NEXT:    bic v0.4h, #64, lsl #8
+; CHECK-NEXT:    ssra v1.4h, v0.4h, #14
+; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ret
+  ; set the 15th bit to zero. e.g. 0b1111111111111111 to 0b1011111111111111
+  %2 = and <4 x i16> %0, <i16 49151, i16 49151, i16 49151, i16 49151>
+  ; the first 15 bits are zero, the last bit can be zero or one. e.g. 0b1011111111111111 to 0b0000000000000001
+  %3 = lshr <4 x i16> %0, <i16 15, i16 15, i16 15, i16 15>
+  ; the first 15 bits may be 1, and the last bit is zero. e.g. 0b1011111111111111 to 0b1111111111111110
+  %4 = ashr <4 x i16> %2, <i16 14, i16 14, i16 14, i16 14>
+  %5 = or <4 x i16> %3, %4
+  ret <4 x i16> %5
+}
+
+define <4 x i32> @ssra_v4i32(<4 x i32> %0) {
+; CHECK-LABEL: ssra_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushr v1.4s, v0.4s, #31
+; CHECK-NEXT:    bic v0.4s, #64, lsl #24
+; CHECK-NEXT:    ssra v1.4s, v0.4s, #30
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  ; set the 31st bit to zero.
+  %2 = and <4 x i32> %0, <i32 3221225471, i32 3221225471, i32 3221225471, i32 3221225471>
+  ; the first 31 bits are zero, the last bit can be zero or one.
+  %3 = lshr <4 x i32> %0, <i32 31, i32 31, i32 31, i32 31>
+  ; the first 31 bits may be 1, and the last bit is zero.
+  %4 = ashr <4 x i32> %2, <i32 30, i32 30, i32 30, i32 30>
+  %5 = or <4 x i32> %3, %4
+  ret <4 x i32> %5
+}
+
+define <1 x i64> @ssra_v1i64(<2 x i32> %0) {
+; CHECK-LABEL: ssra_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic v0.2s, #64, lsl #24
+; CHECK-NEXT:    ushr d1, d0, #63
+; CHECK-NEXT:    ssra d1, d0, #62
+; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ret
+  %2 = and <2 x i32> %0, <i32 3221225471, i32 3221225471>
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = lshr <1 x i64> %3, <i64 63>
+  %5 = ashr <1 x i64> %3, <i64 62>
+  %6 = or <1 x i64> %4, %5
+  ret <1 x i64> %6
+}
+
+define <2 x i64> @ssra_v2i64(<4 x i32> %0) {
+; CHECK-LABEL: ssra_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic v0.4s, #64, lsl #24
+; CHECK-NEXT:    ushr v1.2d, v0.2d, #63
+; CHECK-NEXT:    ssra v1.2d, v0.2d, #62
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %2 = and <4 x i32> %0, <i32 3221225471, i32 3221225471, i32 3221225471, i32 3221225471>
+  %3 = bitcast <4 x i32> %2 to <2 x i64>
+  %4 = lshr <2 x i64> %3, <i64 63, i64 63>
+  %5 = ashr <2 x i64> %3, <i64 62, i64 62>
+  %6 = or <2 x i64> %4, %5
+  ret <2 x i64> %6
+}
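
Note (illustrative, not part of the patch): add_and_or_is_add only rewrites (or x, y) into an accumulate when SelectionDAG::haveNoCommonBitsSet proves the two operands disjoint, and the new BICi/VLSHR/VASHR known-bits cases are what make that proof possible for the tests above. The sketch below replays that reasoning for the ssra_v4i16 case with the same llvm::KnownBits helpers the patch calls; the standalone driver (main, the 16-bit width, the 0xBFFF mask) is an assumed example rather than LLVM code, and it targets a recent LLVM where APInt::isAllOnes exists.

// Standalone sketch of the disjointness proof behind add_and_or_is_add.
// Build against LLVM's Support libraries.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  // One 16-bit lane of an unknown value x.
  KnownBits X(16);

  // Operand A of the 'or': x lshr 15 -- bits 15..1 become known zero,
  // mirroring what the new AArch64ISD::VLSHR case computes.
  KnownBits A = KnownBits::lshr(X, KnownBits::makeConstant(APInt(16, 15)));

  // Operand B: (x & 0xBFFF) ashr 14. The mask (a BICi clearing bit 14)
  // makes bit 14 known zero, so after the arithmetic shift bit 0 is known
  // zero, mirroring the BICi and VASHR cases.
  KnownBits Masked = X;
  Masked &= KnownBits::makeConstant(APInt(16, 0xBFFF));
  KnownBits B = KnownBits::ashr(Masked, KnownBits::makeConstant(APInt(16, 14)));

  // Essentially the check haveNoCommonBitsSet performs: every bit position
  // is known zero on at least one side, so (or A, B) behaves like (add A, B)
  // and may be selected as SSRA/USRA.
  bool Disjoint = (A.Zero | B.Zero).isAllOnes();
  outs() << "operands disjoint: " << (Disjoint ? "yes" : "no") << "\n"; // yes
  return 0;
}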