diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1091,6 +1091,7 @@
   SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
                                          SDValue Chain, SDValue &Size,
                                          SelectionDAG &DAG) const;
+  SDValue LowerAVGFloor_AVGCeil(SDValue Node, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,
                                                SelectionDAG &DAG) const;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1288,12 +1288,10 @@
       setOperationAction(ISD::SDIVREM, VT, Expand);
       setOperationAction(ISD::UDIVREM, VT, Expand);
 
-      if (Subtarget->hasSVE2()) {
-        setOperationAction(ISD::AVGFLOORS, VT, Custom);
-        setOperationAction(ISD::AVGFLOORU, VT, Custom);
-        setOperationAction(ISD::AVGCEILS, VT, Custom);
-        setOperationAction(ISD::AVGCEILU, VT, Custom);
-      }
+      setOperationAction(ISD::AVGFLOORS, VT, Custom);
+      setOperationAction(ISD::AVGFLOORU, VT, Custom);
+      setOperationAction(ISD::AVGCEILS, VT, Custom);
+      setOperationAction(ISD::AVGCEILU, VT, Custom);
     }
 
     // Illegal unpacked integer vector types.
@@ -6076,13 +6074,21 @@
   case ISD::ABDU:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
   case ISD::AVGFLOORS:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDS_PRED);
+    if (Subtarget->hasSVE2())
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDS_PRED);
+    return LowerAVGFloor_AVGCeil(Op, DAG);
   case ISD::AVGFLOORU:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDU_PRED);
+    if (Subtarget->hasSVE2())
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDU_PRED);
+    return LowerAVGFloor_AVGCeil(Op, DAG);
   case ISD::AVGCEILS:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDS_PRED);
+    if (Subtarget->hasSVE2())
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDS_PRED);
+    return LowerAVGFloor_AVGCeil(Op, DAG);
   case ISD::AVGCEILU:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDU_PRED);
+    if (Subtarget->hasSVE2())
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDU_PRED);
+    return LowerAVGFloor_AVGCeil(Op, DAG);
   case ISD::BITREVERSE:
     return LowerBitreverse(Op, DAG);
   case ISD::BSWAP:
@@ -13362,6 +13368,35 @@
   return Chain;
 }
 
+// Expand ISD::AVGFLOOR[S|U]/AVGCEIL[S|U] for subtargets without the SVE2
+// halving-add instructions, using the overflow-free identities:
+//   floor: avg(A, B) = (A >> 1) + (B >> 1) + ((A & B) & 1)
+//   ceil:  avg(A, B) = (A >> 1) + (B >> 1) + ((A | B) & 1)
+SDValue AArch64TargetLowering::LowerAVGFloor_AVGCeil(SDValue Node,
+                                                     SelectionDAG &DAG) const {
+  EVT VT = Node->getValueType(0);
+  assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
+
+  SDLoc DL(Node);
+  SDValue OpA = Node->getOperand(0);
+  SDValue OpB = Node->getOperand(1);
+  SDValue One = DAG.getConstant(1, DL, VT);
+
+  // Halve each operand individually so the intermediate sum cannot wrap.
+  SDValue SrlA = DAG.getNode(ISD::SRL, DL, VT, OpA, One);
+  SDValue SrlB = DAG.getNode(ISD::SRL, DL, VT, OpB, One);
+
+  // Rounding term: the floor forms keep the carry only when both low bits
+  // are set; the ceil forms also round up when either low bit is set.
+  unsigned Opc = Node->getOpcode();
+  bool IsFloor = Opc == ISD::AVGFLOORU || Opc == ISD::AVGFLOORS;
+  SDValue Tmp = DAG.getNode(IsFloor ? ISD::AND : ISD::OR, DL, VT, OpA, OpB);
+  Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp, One);
+
+  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, SrlA, SrlB);
+  return DAG.getNode(ISD::ADD, DL, VT, Add, Tmp);
+}
+
 SDValue
AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { @@ -17593,7 +17627,8 @@ } static SDValue performTruncateCombine(SDNode *N, - SelectionDAG &DAG) { + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() && @@ -17605,6 +17640,77 @@ return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op); } + // If we see something like trunc( lshr( add(add(a, b), c) , 1) ), where one of (a,b,c) should be 0 or 1 then + // we can convert that into lshr(a, 1) + lshr(b, 1) + (a|b)&1 + if(!Subtarget->hasSVE2() && N->getOperand(0).getOpcode() == ISD::SRL) { + SDValue Op0 = N->getOperand(0).getOperand(0); //add(add(a, b), c) + SDValue Op1 = N->getOperand(0).getOperand(1); //shiftAmount + + if(Op0.getOpcode() != ISD::ADD) + return SDValue(); + + if(!isOneConstant(Op1)) + return SDValue(); + + if(Op0.getOperand(0).getOpcode() != ISD::ADD) // add(a, b) + return SDValue(); + + SDValue A, B, C; + C = Op0.getOperand(1); + A = Op0.getOperand(0).getOperand(0); // a + B = Op0.getOperand(0).getOperand(1); // b + + // we have add(add(a, b), c), + // one of a or b or c should be a constant 0 or 1: + // usually C is the constant, but in some cases, A or B may be the constant. 
+ // check if any A or B is the constants, and then swap them with C + if(isNullConstant(A) || isOneConstant(A)) { + SDValue tmp = A; + A = C; + C = tmp; + } + else if(isNullConstant(B) || isOneConstant(B)) { + SDValue tmp = B; + B = C; + C = tmp; + } + else if(!isNullConstant(C) && !isOneConstant(C)) + // no constant found + return SDValue(); + + if(A.getOpcode() == ISD::ZERO_EXTEND) + A = A.getOperand(0); + + if(B.getOpcode() == ISD::ZERO_EXTEND) + B = B.getOperand(0); + + // (A >> 1) + SDValue AShifted = DAG.getNode(ISD::SRL, SDLoc(N), A.getValueType(), A, Op1); + // (B >> 1) + SDValue BShifted = DAG.getNode(ISD::SRL, SDLoc(N), B.getValueType(), B, Op1); + + SDValue AB; + if(isNullConstant(C)) { + // (A & B) + AB = DAG.getNode(ISD::AND, SDLoc(N), A.getValueType(), A, B); + } + else { + // (A | B) + AB = DAG.getNode(ISD::OR, SDLoc(N), A.getValueType(), A, B); + } + + // (A & B)&1 + // (A | B)&1 + SDValue ABndOne = DAG.getNode(ISD::AND, SDLoc(N), A.getValueType(), AB, DAG.getConstant(1, SDLoc(N), AB.getValueType())); + + // AShifted + BShifted + SDValue Add1 = DAG.getNode(ISD::ADD, SDLoc(N), AShifted.getValueType(), AShifted, BShifted); + // (AShifted + BShifted) + ABndOne + SDValue newInstr = DAG.getNode(ISD::ADD, SDLoc(N), AShifted.getValueType(), Add1, ABndOne); + + return newInstr; + } + return SDValue(); } @@ -18489,7 +18595,8 @@ static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then // we can convert that DUP into another extract_high (of a bigger DUP), which // helps the backend to decide that an sabdl2 would be useful, saving a real @@ -18511,6 +18618,66 @@ N->getOperand(0)->getOpcode() == ISD::SETCC) return performSignExtendSetCCCombine(N, DCI, DAG); + // If we see something like zext( lshr( add(add(a, b), 1) , 1) ) then + // we can convert that into lshr(a, 1) + 
lshr(b, 1) + (a|b)&1 + if(!Subtarget->hasSVE2() && N->getOperand(0).getOpcode() == ISD::SRL) { + SDValue Op0 = N->getOperand(0).getOperand(0); //add(add(a, b), 1) + SDValue Op1 = N->getOperand(0).getOperand(1); //shiftAmount + if(Op0.getOpcode() != ISD::ADD || !isOneConstant(Op1)) + return SDValue(); + + if(Op0.getOperand(0).getOpcode() != ISD::ADD) // add(a, b) + return SDValue(); + + SDValue AddedValue = Op0.getOperand(1); + // AddedValue must be 0 or 1 + if(!isNullConstant(AddedValue) && !isOneConstant(AddedValue)) + return SDValue(); + + SDValue A, B; + if(Op0.getOperand(0).getOperand(0).getOpcode() == ISD::TRUNCATE) + A = Op0.getOperand(0).getOperand(0).getOperand(0); // a + else + A = Op0.getOperand(0).getOperand(0); // a + + if(Op0.getOperand(0).getOperand(1).getOpcode() == ISD::TRUNCATE) + B = Op0.getOperand(0).getOperand(1).getOperand(0); // b + else + B = Op0.getOperand(0).getOperand(1); // b + + // (A >> 1) + SDValue AShifted = DAG.getNode(ISD::SRL, SDLoc(N), A.getValueType(), A, Op1); + // (B >> 1) + SDValue BShifted = DAG.getNode(ISD::SRL, SDLoc(N), B.getValueType(), B, Op1); + + SDValue AB; + if(isNullConstant(AddedValue)) { + // (A & B) + AB = DAG.getNode(ISD::AND, SDLoc(N), A.getValueType(), A, B); + } + else { + // (A | B) + AB = DAG.getNode(ISD::OR, SDLoc(N), A.getValueType(), A, B); + } + + // (A & B)&1 + // (A | B)&1 + SDValue ABndOne = DAG.getNode(ISD::AND, SDLoc(N), A.getValueType(), AB, DAG.getConstant(1, SDLoc(N), AB.getValueType())); + + // AShifted + BShifted + SDValue Add1 = DAG.getNode(ISD::ADD, SDLoc(N), AShifted.getValueType(), AShifted, BShifted); + // (AShifted + BShifted) + ABndOne + SDValue newInstr = DAG.getNode(ISD::ADD, SDLoc(N), AShifted.getValueType(), Add1, ABndOne); + + if(N->getValueType(0).getSizeInBits() == newInstr.getValueSizeInBits()) + return newInstr; + + else if(N->getValueType(0).getSizeInBits() > newInstr.getValueSizeInBits()) + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), newInstr); + + 
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), newInstr); + } + return SDValue(); } @@ -21497,7 +21664,7 @@ case ISD::BUILD_VECTOR: return performBuildVectorCombine(N, DCI, DAG); case ISD::TRUNCATE: - return performTruncateCombine(N, DAG); + return performTruncateCombine(N, DAG, Subtarget); case AArch64ISD::ANDS: return performFlagSettingCombine(N, DCI, ISD::AND); case AArch64ISD::ADC: @@ -21537,7 +21704,7 @@ case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: - return performExtendCombine(N, DCI, DAG); + return performExtendCombine(N, DCI, DAG, Subtarget); case ISD::SIGN_EXTEND_INREG: return performSignExtendInRegCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: diff --git a/llvm/test/CodeGen/AArch64/neon-lshr.ll b/llvm/test/CodeGen/AArch64/neon-lshr.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/neon-lshr.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define i8 @lshr_trunc(i8 %a, i8%b) +; CHECK-LABEL: lshr_trunc: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w1, #0xfe +; CHECK-NEXT: ubfx w9, w0, #1, #7 +; CHECK-NEXT: orr w10, w0, w1 +; CHECK-NEXT: add w8, w9, w8, lsr #1 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret +{ + %zexta = zext i8 %a to i16 + %zextb = zext i8 %b to i16 + %zextab = add i16 %zexta, %zextb + %add = add i16 %zextab, 1 + %shift = lshr i16 %add, 1 + %trunca = trunc i16 %shift to i8 + ret i8 %trunca +} + diff --git a/llvm/test/CodeGen/AArch64/sve-avg_floor_ceil.ll b/llvm/test/CodeGen/AArch64/sve-avg_floor_ceil.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-avg_floor_ceil.ll @@ -0,0 +1,723 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 | FileCheck %s + +define @hadds_v2i64( %s0, %s1) { +; CHECK-LABEL: 
hadds_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.d, z1.d, #1 +; CHECK-NEXT: lsr z3.d, z0.d, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.d, z3.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %m = add nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @haddu_v2i64( %s0, %s1) { +; CHECK-LABEL: haddu_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.d, z1.d, #1 +; CHECK-NEXT: lsr z3.d, z0.d, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.d, z3.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %m = add nuw nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @hadds_v2i32( %s0, %s1) { +; CHECK-LABEL: hadds_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxtw z0.d, p0/m, z0.d +; CHECK-NEXT: adr z0.d, [z0.d, z1.d, sxtw] +; CHECK-NEXT: lsr z0.d, z0.d, #1 +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %m = add nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @haddu_v2i32( %s0, %s1) { +; CHECK-LABEL: haddu_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and z2.d, z0.d, z1.d +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: lsr z1.d, z1.d, #1 +; CHECK-NEXT: lsr z0.d, z0.d, #1 +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %m = add nuw nsw %s0s, %s1s + %s = lshr %m, shufflevector ( 
insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @hadds_v4i32( %s0, %s1) { +; CHECK-LABEL: hadds_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.s, z1.s, #1 +; CHECK-NEXT: lsr z3.s, z0.s, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.s, z3.s, z2.s +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %m = add nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @haddu_v4i32( %s0, %s1) { +; CHECK-LABEL: haddu_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.s, z1.s, #1 +; CHECK-NEXT: lsr z3.s, z0.s, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.s, z3.s, z2.s +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %m = add nuw nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @hadds_v2i16( %s0, %s1) { +; CHECK-LABEL: hadds_v2i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxth z0.d, p0/m, z0.d +; CHECK-NEXT: sxth z1.d, p0/m, z1.d +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: lsr z0.d, z0.d, #1 +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %m = add nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @haddu_v2i16( %s0, %s1) { +; CHECK-LABEL: haddu_v2i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and z2.d, z0.d, z1.d +; CHECK-NEXT: and z0.d, z0.d, #0xffff +; CHECK-NEXT: and z1.d, z1.d, #0xffff +; CHECK-NEXT: lsr z1.d, z1.d, #1 +; CHECK-NEXT: lsr z0.d, z0.d, #1 +; CHECK-NEXT: and z2.d, 
z2.d, #0x1 +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %m = add nuw nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @hadds_v4i16( %s0, %s1) { +; CHECK-LABEL: hadds_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sxth z0.s, p0/m, z0.s +; CHECK-NEXT: sxth z1.s, p0/m, z1.s +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: lsr z0.s, z0.s, #1 +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %m = add nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @haddu_v4i16( %s0, %s1) { +; CHECK-LABEL: haddu_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and z2.d, z0.d, z1.d +; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: lsr z1.s, z1.s, #1 +; CHECK-NEXT: lsr z0.s, z0.s, #1 +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %m = add nuw nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @hadds_v8i16( %s0, %s1) { +; CHECK-LABEL: hadds_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.h, z1.h, #1 +; CHECK-NEXT: lsr z3.h, z0.h, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.h, z3.h, z2.h +; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: add z0.h, z2.h, z0.h +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %m = add nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @haddu_v8i16( %s0, %s1) { +; CHECK-LABEL: 
haddu_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.h, z1.h, #1 +; CHECK-NEXT: lsr z3.h, z0.h, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.h, z3.h, z2.h +; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: add z0.h, z2.h, z0.h +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %m = add nuw nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @hadds_v4i8( %s0, %s1) { +; CHECK-LABEL: hadds_v4i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sxtb z0.s, p0/m, z0.s +; CHECK-NEXT: sxtb z1.s, p0/m, z1.s +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: lsr z0.s, z0.s, #1 +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %m = add nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @haddu_v4i8( %s0, %s1) { +; CHECK-LABEL: haddu_v4i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and z2.d, z0.d, z1.d +; CHECK-NEXT: and z0.s, z0.s, #0xff +; CHECK-NEXT: and z1.s, z1.s, #0xff +; CHECK-NEXT: lsr z1.s, z1.s, #1 +; CHECK-NEXT: lsr z0.s, z0.s, #1 +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %m = add nuw nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @hadds_v8i8( %s0, %s1) { +; CHECK-LABEL: hadds_v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: sxtb z0.h, p0/m, z0.h +; CHECK-NEXT: sxtb z1.h, p0/m, z1.h +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: lsr z0.h, z0.h, #1 +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %m = add nsw %s0s, %s1s + %s = lshr %m, 
shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @haddu_v8i8( %s0, %s1) { +; CHECK-LABEL: haddu_v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and z2.d, z0.d, z1.d +; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEXT: lsr z1.h, z1.h, #1 +; CHECK-NEXT: lsr z0.h, z0.h, #1 +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %m = add nuw nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @hadds_v16i8( %s0, %s1) { +; CHECK-LABEL: hadds_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.b, z1.b, #1 +; CHECK-NEXT: lsr z3.b, z0.b, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.b, z3.b, z2.b +; CHECK-NEXT: and z0.b, z0.b, #0x1 +; CHECK-NEXT: add z0.b, z2.b, z0.b +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %m = add nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @haddu_v16i8( %s0, %s1) { +; CHECK-LABEL: haddu_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.b, z1.b, #1 +; CHECK-NEXT: lsr z3.b, z0.b, #1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.b, z3.b, z2.b +; CHECK-NEXT: and z0.b, z0.b, #0x1 +; CHECK-NEXT: add z0.b, z2.b, z0.b +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %m = add nuw nsw %s0s, %s1s + %s = lshr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s2 = trunc %s to + ret %s2 +} + +define @rhadds_v2i64( %s0, %s1) { +; CHECK-LABEL: rhadds_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.d, z1.d, #1 +; CHECK-NEXT: lsr z3.d, z0.d, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; 
CHECK-NEXT: add z2.d, z3.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %add = add %s0s, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %add2 = add %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhaddu_v2i64( %s0, %s1) { +; CHECK-LABEL: rhaddu_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.d, z1.d, #1 +; CHECK-NEXT: lsr z3.d, z0.d, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.d, z3.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %add2 = add nuw nsw %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhadds_v2i32( %s0, %s1) { +; CHECK-LABEL: rhadds_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z2.d, #-1 // =0xffffffffffffffff +; CHECK-NEXT: sxtw z0.d, p0/m, z0.d +; CHECK-NEXT: sxtw z1.d, p0/m, z1.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: sub z0.d, z1.d, z0.d +; CHECK-NEXT: lsr z0.d, z0.d, #1 +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %add = add %s0s, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %add2 = add %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhaddu_v2i32( %s0, %s1) { +; CHECK-LABEL: rhaddu_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr z2.d, z0.d, z1.d +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: and z1.d, z1.d, 
#0xffffffff +; CHECK-NEXT: lsr z1.d, z1.d, #1 +; CHECK-NEXT: lsr z0.d, z0.d, #1 +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %add2 = add nuw nsw %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhadds_v4i32( %s0, %s1) { +; CHECK-LABEL: rhadds_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.s, z1.s, #1 +; CHECK-NEXT: lsr z3.s, z0.s, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.s, z3.s, z2.s +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %add = add %s0s, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %add2 = add %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhaddu_v4i32( %s0, %s1) { +; CHECK-LABEL: rhaddu_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.s, z1.s, #1 +; CHECK-NEXT: lsr z3.s, z0.s, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.s, z3.s, z2.s +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %add2 = add nuw nsw %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhadds_v2i16( %s0, %s1) { +; CHECK-LABEL: rhadds_v2i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z2.d, #-1 // 
=0xffffffffffffffff +; CHECK-NEXT: sxth z0.d, p0/m, z0.d +; CHECK-NEXT: sxth z1.d, p0/m, z1.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: sub z0.d, z1.d, z0.d +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: lsr z0.d, z0.d, #1 +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %add = add %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add2 = add %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhaddu_v2i16( %s0, %s1) { +; CHECK-LABEL: rhaddu_v2i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and z0.d, z0.d, #0xffff +; CHECK-NEXT: mov z2.d, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, #0xffff +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: sub z0.d, z1.d, z0.d +; CHECK-NEXT: lsr z0.d, z0.d, #1 +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add2 = add nuw nsw %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhadds_v4i16( %s0, %s1) { +; CHECK-LABEL: rhadds_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: sxth z0.s, p0/m, z0.s +; CHECK-NEXT: sxth z1.s, p0/m, z1.s +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: sub z0.s, z1.s, z0.s +; CHECK-NEXT: lsr z0.s, z0.s, #1 +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %add = add %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add2 = add %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhaddu_v4i16( %s0, %s1) { +; CHECK-LABEL: 
rhaddu_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr z2.d, z0.d, z1.d +; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: lsr z1.s, z1.s, #1 +; CHECK-NEXT: lsr z0.s, z0.s, #1 +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add2 = add nuw nsw %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhadds_v8i16( %s0, %s1) { +; CHECK-LABEL: rhadds_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.h, z1.h, #1 +; CHECK-NEXT: lsr z3.h, z0.h, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.h, z3.h, z2.h +; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: add z0.h, z2.h, z0.h +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %add = add %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add2 = add %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhaddu_v8i16( %s0, %s1) { +; CHECK-LABEL: rhaddu_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.h, z1.h, #1 +; CHECK-NEXT: lsr z3.h, z0.h, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.h, z3.h, z2.h +; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: add z0.h, z2.h, z0.h +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add2 = add nuw nsw %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define 
@rhadds_v4i8( %s0, %s1) { +; CHECK-LABEL: rhadds_v4i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: sxtb z0.s, p0/m, z0.s +; CHECK-NEXT: sxtb z1.s, p0/m, z1.s +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: sub z0.s, z1.s, z0.s +; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: lsr z0.s, z0.s, #1 +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %add = add %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add2 = add %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhaddu_v4i8( %s0, %s1) { +; CHECK-LABEL: rhaddu_v4i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and z0.s, z0.s, #0xff +; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.s, z1.s, #0xff +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: sub z0.s, z1.s, z0.s +; CHECK-NEXT: lsr z0.s, z0.s, #1 +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add2 = add nuw nsw %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhadds_v8i8( %s0, %s1) { +; CHECK-LABEL: rhadds_v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z2.h, #-1 // =0xffffffffffffffff +; CHECK-NEXT: sxtb z0.h, p0/m, z0.h +; CHECK-NEXT: sxtb z1.h, p0/m, z1.h +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: sub z0.h, z1.h, z0.h +; CHECK-NEXT: lsr z0.h, z0.h, #1 +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %add = add %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add2 = add %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i16 
1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhaddu_v8i8( %s0, %s1) { +; CHECK-LABEL: rhaddu_v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr z2.d, z0.d, z1.d +; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEXT: lsr z1.h, z1.h, #1 +; CHECK-NEXT: lsr z0.h, z0.h, #1 +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add2 = add nuw nsw %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhadds_v16i8( %s0, %s1) { +; CHECK-LABEL: rhadds_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.b, z1.b, #1 +; CHECK-NEXT: lsr z3.b, z0.b, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.b, z3.b, z2.b +; CHECK-NEXT: and z0.b, z0.b, #0x1 +; CHECK-NEXT: add z0.b, z2.b, z0.b +; CHECK-NEXT: ret +entry: + %s0s = sext %s0 to + %s1s = sext %s1 to + %add = add %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add2 = add %add, %s1s + %s = lshr %add2, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +} + +define @rhaddu_v16i8( %s0, %s1) { +; CHECK-LABEL: rhaddu_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsr z2.b, z1.b, #1 +; CHECK-NEXT: lsr z3.b, z0.b, #1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: add z2.b, z3.b, z2.b +; CHECK-NEXT: and z0.b, z0.b, #0x1 +; CHECK-NEXT: add z0.b, z2.b, z0.b +; CHECK-NEXT: ret +entry: + %s0s = zext %s0 to + %s1s = zext %s1 to + %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add2 = add nuw nsw %add, %s1s + %s = lshr %add2, shufflevector 
( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %result = trunc %s to + ret %result +}