diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -187,6 +187,10 @@
   SADDV,
   UADDV,
 
+  // Vector rounding halving addition
+  SRHADD,
+  URHADD,
+
   // Vector across-lanes min/max
   // Only the lower result lane is defined.
   SMINV,
@@ -863,6 +867,7 @@
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -838,6 +838,8 @@
     setOperationAction(ISD::UADDSAT, VT, Legal);
     setOperationAction(ISD::SSUBSAT, VT, Legal);
     setOperationAction(ISD::USUBSAT, VT, Legal);
+
+    setOperationAction(ISD::TRUNCATE, VT, Custom);
   }
   for (MVT VT : { MVT::v4f16, MVT::v2f32,
                   MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
@@ -1432,6 +1434,8 @@
     MAKE_CASE(AArch64ISD::FCMLTz)
     MAKE_CASE(AArch64ISD::SADDV)
     MAKE_CASE(AArch64ISD::UADDV)
+    MAKE_CASE(AArch64ISD::SRHADD)
+    MAKE_CASE(AArch64ISD::URHADD)
     MAKE_CASE(AArch64ISD::SMINV)
     MAKE_CASE(AArch64ISD::UMINV)
     MAKE_CASE(AArch64ISD::SMAXV)
@@ -3260,6 +3264,14 @@
     return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3));
   }
+
+  case Intrinsic::aarch64_neon_srhadd:
+  case Intrinsic::aarch64_neon_urhadd: {
+    bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd;
+    unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
+    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
+                       Op.getOperand(2));
+  }
   }
 }
 
@@ -3524,6 +3536,8 @@
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::VSCALE:
     return LowerVSCALE(Op, DAG);
+  case ISD::TRUNCATE:
+    return LowerTRUNCATE(Op, DAG);
   case ISD::LOAD:
     if (useSVEForFixedLengthVectorVT(Op.getValueType()))
       return LowerFixedLengthVectorLoadToSVE(Op, DAG);
@@ -8773,6 +8787,78 @@
   return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
 }
 
+// Attempt to form urhadd(OpA, OpB) from
+// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(WideElemBits))), 1)).
+// The original form of this expression is
+// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function
+// is called the srl will have been lowered to AArch64ISD::VLSHR and the
+// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)).
+// This function can also recognize a variant of this pattern that uses sign
+// extension instead of zero extension and form a srhadd(OpA, OpB) from it.
+// Returns Op itself, leaving the truncate as it is, whenever the pattern is
+// not matched.
+SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (!VT.isVector() || VT.isScalableVector())
+    return Op;
+
+  // Since we are looking for a right shift by a constant value of 1 and we are
+  // operating on types at least 16 bits in length (sign/zero extended OpA and
+  // OpB, which are at least 8 bits), it follows that the truncate will always
+  // discard the shifted-in bit and therefore the right shift will be logical
+  // regardless of the signedness of OpA and OpB.
+  SDValue Shift = Op.getOperand(0);
+  if (Shift.getOpcode() != AArch64ISD::VLSHR)
+    return Op;
+
+  // Is the right shift using an immediate value of 1?
+  uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
+  if (ShiftAmount != 1)
+    return Op;
+
+  SDValue Sub = Shift->getOperand(0);
+  if (Sub.getOpcode() != ISD::SUB)
+    return Op;
+
+  SDValue Xor = Sub.getOperand(1);
+  if (Xor.getOpcode() != ISD::XOR)
+    return Op;
+
+  SDValue ExtendOpA = Xor.getOperand(0);
+  SDValue ExtendOpB = Sub.getOperand(0);
+  unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
+  unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
+  if (!(ExtendOpAOpc == ExtendOpBOpc &&
+        (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
+    return Op;
+
+  // Is the result of the right shift being truncated to the same value type as
+  // the original operands, OpA and OpB?
+  SDValue OpA = ExtendOpA.getOperand(0);
+  SDValue OpB = ExtendOpB.getOperand(0);
+  EVT OpAVT = OpA.getValueType();
+  assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
+  if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
+    return Op;
+
+  // The XOR must invert every bit of the extended OpA, i.e. its right hand
+  // side must be an all-ones splat of the wide (pre-truncate) element type.
+  // Comparing only the low VT.getScalarSizeInBits() bits is not sufficient:
+  // it would also accept e.g. xor(zext(OpA), 0x00FF), which is zext(~OpA)
+  // rather than ~zext(OpA) and does not compute a rounding halving add.
+  // isBuildVectorAllOnes also copes with build_vector operands that were
+  // promoted to a wider integer type during legalization.
+  if (!ISD::isBuildVectorAllOnes(Xor.getOperand(1).getNode()))
+    return Op;
+
+  SDLoc DL(Op);
+  bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
+  unsigned RHADDOpc = IsSignExtend ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
+  return DAG.getNode(RHADDOpc, DL, VT, OpA, OpB);
+}
+
 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
                                                       SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
@@ -10981,6 +11067,7 @@
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+  unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
 
   // Optimize concat_vectors of truncated vectors, where the intermediate
   // type is illegal, to avoid said illegality, e.g.,
@@ -10993,9 +11080,8 @@
   // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
   // on both input and result type, so we might generate worse code.
   // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
-  if (N->getNumOperands() == 2 &&
-      N0->getOpcode() == ISD::TRUNCATE &&
-      N1->getOpcode() == ISD::TRUNCATE) {
+  if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
+      N1Opc == ISD::TRUNCATE) {
     SDValue N00 = N0->getOperand(0);
     SDValue N10 = N1->getOperand(0);
     EVT N00VT = N00.getValueType();
@@ -11020,6 +11106,52 @@
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  // Optimise concat_vectors of two [us]rhadds that use extracted subvectors
+  // from the same original vectors. Combine these into a single [us]rhadd that
+  // operates on the two original vectors. Example:
+  //  (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
+  //                                        extract_subvector (v16i8 OpB,
+  //                                        <0>))),
+  //                         (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
+  //                                        extract_subvector (v16i8 OpB,
+  //                                        <8>)))))
+  // ->
+  //  (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
+  if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
+      (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) {
+    SDValue N00 = N0->getOperand(0);
+    SDValue N01 = N0->getOperand(1);
+    SDValue N10 = N1->getOperand(0);
+    SDValue N11 = N1->getOperand(1);
+
+    EVT N00VT = N00.getValueType();
+    EVT N10VT = N10.getValueType();
+
+    if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
+      SDValue N00Source = N00->getOperand(0);
+      SDValue N01Source = N01->getOperand(0);
+      SDValue N10Source = N10->getOperand(0);
+      SDValue N11Source = N11->getOperand(0);
+
+      if (N00Source == N10Source && N01Source == N11Source &&
+          N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
+        assert(N0.getValueType() == N1.getValueType());
+
+        uint64_t N00Index = N00.getConstantOperandVal(1);
+        uint64_t N01Index = N01.getConstantOperandVal(1);
+        uint64_t N10Index = N10.getConstantOperandVal(1);
+        uint64_t N11Index = N11.getConstantOperandVal(1);
+
+        if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
+            N10Index == N00VT.getVectorNumElements())
+          return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
+      }
+    }
+  }
+
   // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
   // splat. The indexed instructions are going to be expecting a DUPLANE64, so
   // canonicalise to that.
@@ -11038,7 +11170,7 @@
   // becomes
   //   (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
 
-  if (N1->getOpcode() != ISD::BITCAST)
+  if (N1Opc != ISD::BITCAST)
     return SDValue();
   SDValue RHS = N1->getOperand(0);
   MVT RHSTy = RHS.getValueType().getSimpleVT();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -552,6 +552,9 @@
 def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
 def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
 
+def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>;
+def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>;
+
 def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
 def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
@@ -4071,7 +4074,7 @@
 defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
 defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
 defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
-defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>;
+defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", AArch64srhadd>;
 defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
 defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
 defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
@@ -4088,7 +4091,7 @@
 defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
 defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
 defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
-defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
+defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>;
 defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
 defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
 defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -1,8 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
 
 define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: shadd8b:
-;CHECK: shadd.8b
+; CHECK-LABEL: shadd8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    shadd.8b v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = load <8 x i8>, <8 x i8>* %B
 	%tmp3 = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -10,8 +15,12 @@
 }
 
 define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: shadd16b:
-;CHECK: shadd.16b
+; CHECK-LABEL: shadd16b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    shadd.16b v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <16 x i8>, <16 x i8>* %A
 	%tmp2 = load <16 x i8>, <16 x i8>* %B
 	%tmp3 = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -19,8 +28,12 @@
 }
 
 define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: shadd4h:
-;CHECK: shadd.4h
+; CHECK-LABEL: shadd4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    shadd.4h v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = load <4 x i16>, <4 x i16>* %B
 	%tmp3 = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -28,8 +41,12 @@
 }
 
 define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: shadd8h:
-;CHECK: shadd.8h
+; CHECK-LABEL: shadd8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    shadd.8h v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <8 x i16>, <8 x i16>* %A
 	%tmp2 = load <8 x i16>, <8 x i16>* %B
 	%tmp3 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -37,8 +54,12 @@
 }
 
 define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: shadd2s:
-;CHECK: shadd.2s
+; CHECK-LABEL: shadd2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    shadd.2s v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = load <2 x i32>, <2 x i32>* %B
 	%tmp3 = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -46,8 +67,12 @@
 }
 
 define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: shadd4s:
-;CHECK: shadd.4s
+; CHECK-LABEL: shadd4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    shadd.4s v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <4 x i32>, <4 x i32>* %A
 	%tmp2 = load <4 x i32>, <4 x i32>* %B
 	%tmp3 = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -55,8 +80,12 @@
 }
 
 define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: uhadd8b:
-;CHECK: uhadd.8b
+; CHECK-LABEL: uhadd8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uhadd.8b v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = load <8 x i8>, <8 x i8>* %B
 	%tmp3 = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -64,8 +93,12 @@
 }
 
 define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: uhadd16b:
-;CHECK: uhadd.16b
+; CHECK-LABEL: uhadd16b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    uhadd.16b v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <16 x i8>, <16 x i8>* %A
 	%tmp2 = load <16 x i8>, <16 x i8>* %B
 	%tmp3 = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -73,8 +106,12 @@
 }
 
 define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: uhadd4h:
-;CHECK: uhadd.4h
+; CHECK-LABEL: uhadd4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uhadd.4h v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = load <4 x i16>, <4 x i16>* %B
 	%tmp3 = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -82,8 +119,12 @@
 }
 
 define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: uhadd8h:
-;CHECK: uhadd.8h
+; CHECK-LABEL: uhadd8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    uhadd.8h v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <8 x i16>, <8 x i16>* %A
 	%tmp2 = load <8 x i16>, <8 x i16>* %B
 	%tmp3 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -91,8 +132,12 @@
 }
 
 define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: uhadd2s:
-;CHECK: uhadd.2s
+; CHECK-LABEL: uhadd2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uhadd.2s v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = load <2 x i32>, <2 x i32>* %B
 	%tmp3 = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -100,8 +145,12 @@
 }
 
 define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: uhadd4s:
-;CHECK: uhadd.4s
+; CHECK-LABEL: uhadd4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    uhadd.4s v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <4 x i32>, <4 x i32>* %A
 	%tmp2 = load <4 x i32>, <4 x i32>* %B
 	%tmp3 = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -125,8 +174,12 @@
 declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: srhadd8b:
-;CHECK: srhadd.8b
+; CHECK-LABEL: srhadd8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    srhadd.8b v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = load <8 x i8>, <8 x i8>* %B
 	%tmp3 = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -134,8 +187,12 @@
 }
 
 define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: srhadd16b:
-;CHECK: srhadd.16b
+; CHECK-LABEL: srhadd16b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    srhadd.16b v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <16 x i8>, <16 x i8>* %A
 	%tmp2 = load <16 x i8>, <16 x i8>* %B
 	%tmp3 = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -143,8 +200,12 @@
 }
 
 define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: srhadd4h:
-;CHECK: srhadd.4h
+; CHECK-LABEL: srhadd4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    srhadd.4h v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = load <4 x i16>, <4 x i16>* %B
 	%tmp3 = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -152,8 +213,12 @@
 }
 
 define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: srhadd8h:
-;CHECK: srhadd.8h
+; CHECK-LABEL: srhadd8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    srhadd.8h v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <8 x i16>, <8 x i16>* %A
 	%tmp2 = load <8 x i16>, <8 x i16>* %B
 	%tmp3 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -161,8 +226,12 @@
 }
 
 define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: srhadd2s:
-;CHECK: srhadd.2s
+; CHECK-LABEL: srhadd2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    srhadd.2s v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = load <2 x i32>, <2 x i32>* %B
 	%tmp3 = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -170,8 +239,12 @@
 }
 
 define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: srhadd4s:
-;CHECK: srhadd.4s
+; CHECK-LABEL: srhadd4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    srhadd.4s v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <4 x i32>, <4 x i32>* %A
 	%tmp2 = load <4 x i32>, <4 x i32>* %B
 	%tmp3 = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -179,8 +252,12 @@
 }
 
 define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: urhadd8b:
-;CHECK: urhadd.8b
+; CHECK-LABEL: urhadd8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    urhadd.8b v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = load <8 x i8>, <8 x i8>* %B
 	%tmp3 = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -188,8 +265,12 @@
 }
 
 define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: urhadd16b:
-;CHECK: urhadd.16b
+; CHECK-LABEL: urhadd16b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    urhadd.16b v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <16 x i8>, <16 x i8>* %A
 	%tmp2 = load <16 x i8>, <16 x i8>* %B
 	%tmp3 = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -197,8 +278,12 @@
 }
 
 define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: urhadd4h:
-;CHECK: urhadd.4h
+; CHECK-LABEL: urhadd4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    urhadd.4h v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = load <4 x i16>, <4 x i16>* %B
 	%tmp3 = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -206,8 +291,12 @@
 }
 
 define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: urhadd8h:
-;CHECK: urhadd.8h
+; CHECK-LABEL: urhadd8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    urhadd.8h v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <8 x i16>, <8 x i16>* %A
 	%tmp2 = load <8 x i16>, <8 x i16>* %B
 	%tmp3 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -215,8 +304,12 @@
 }
 
 define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: urhadd2s:
-;CHECK: urhadd.2s
+; CHECK-LABEL: urhadd2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    urhadd.2s v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = load <2 x i32>, <2 x i32>* %B
 	%tmp3 = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -224,14 +317,210 @@
 }
 
 define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: urhadd4s:
-;CHECK: urhadd.4s
+; CHECK-LABEL: urhadd4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    urhadd.4s v0, v0, v1
+; CHECK-NEXT:    ret
 	%tmp1 = load <4 x i32>, <4 x i32>* %A
 	%tmp2 = load <4 x i32>, <4 x i32>* %B
 	%tmp3 = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
 	ret <4 x i32> %tmp3
 }
 
+define void @testLowerToSRHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLowerToSRHADD8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd.8b v0, v0, v1
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+  %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
+  %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
+  %add1 = add <8 x i16> %sextsrc1, %sextsrc2
+  %add2 = add <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %result = trunc <8 x i16> %resulti16 to <8 x i8>
+  store <8 x i8> %result, <8 x i8>* %dest, align 8
+  ret void
+}
+
+define void @testLowerToSRHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
+; CHECK-LABEL: testLowerToSRHADD4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd.4h v0, v0, v1
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+  %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
+  %sextsrc2 = sext <4 x i16> %src2 to <4 x i32>
+  %add1 = add <4 x i32> %sextsrc1, %sextsrc2
+  %add2 = add <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
+  %resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
+  %result = trunc <4 x i32> %resulti16 to <4 x i16>
+  store <4 x i16> %result, <4 x i16>* %dest, align 8
+  ret void
+}
+
+define void @testLowerToSRHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
+; CHECK-LABEL: testLowerToSRHADD2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd.2s v0, v0, v1
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+  %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
+  %sextsrc2 = sext <2 x i32> %src2 to <2 x i64>
+  %add1 = add <2 x i64> %sextsrc1, %sextsrc2
+  %add2 = add <2 x i64> %add1, <i64 1, i64 1>
+  %resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
+  %result = trunc <2 x i64> %resulti16 to <2 x i32>
+  store <2 x i32> %result, <2 x i32>* %dest, align 8
+  ret void
+}
+
+define void @testLowerToSRHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLowerToSRHADD16b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd.16b v0, v0, v1
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
+  %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
+  %add1 = add <16 x i16> %sextsrc1, %sextsrc2
+  %add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %result = trunc <16 x i16> %resulti16 to <16 x i8>
+  store <16 x i8> %result, <16 x i8>* %dest, align 16
+  ret void
+}
+
+define void @testLowerToSRHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
+; CHECK-LABEL: testLowerToSRHADD8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd.8h v0, v0, v1
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
+  %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
+  %add1 = add <8 x i32> %sextsrc1, %sextsrc2
+  %add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %result = trunc <8 x i32> %resulti16 to <8 x i16>
+  store <8 x i16> %result, <8 x i16>* %dest, align 16
+  ret void
+}
+
+define void @testLowerToSRHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
+; CHECK-LABEL: testLowerToSRHADD4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd.4s v0, v0, v1
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
+  %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
+  %add1 = add <4 x i64> %sextsrc1, %sextsrc2
+  %add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
+  %resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
+  %result = trunc <4 x i64> %resulti16 to <4 x i32>
+  store <4 x i32> %result, <4 x i32>* %dest, align 16
+  ret void
+}
+
+define void @testLowerToURHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLowerToURHADD8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd.8b v0, v0, v1
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+  %zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
+  %zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
+  %add1 = add <8 x i16> %zextsrc1, %zextsrc2
+  %add2 = add <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %result = trunc <8 x i16> %resulti16 to <8 x i8>
+  store <8 x i8> %result, <8 x i8>* %dest, align 8
+  ret void
+}
+
+define void @testLowerToURHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
+; CHECK-LABEL: testLowerToURHADD4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd.4h v0, v0, v1
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+  %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
+  %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
+  %add1 = add <4 x i32> %zextsrc1, %zextsrc2
+  %add2 = add <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
+  %resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
+  %result = trunc <4 x i32> %resulti16 to <4 x i16>
+  store <4 x i16> %result, <4 x i16>* %dest, align 8
+  ret void
+}
+
+define void @testLowerToURHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
+; CHECK-LABEL: testLowerToURHADD2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd.2s v0, v0, v1
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+  %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
+  %zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
+  %add1 = add <2 x i64> %zextsrc1, %zextsrc2
+  %add2 = add <2 x i64> %add1, <i64 1, i64 1>
+  %resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
+  %result = trunc <2 x i64> %resulti16 to <2 x i32>
+  store <2 x i32> %result, <2 x i32>* %dest, align 8
+  ret void
+}
+
+define void @testLowerToURHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLowerToURHADD16b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd.16b v0, v0, v1
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
+  %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
+  %add1 = add <16 x i16> %zextsrc1, %zextsrc2
+  %add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %result = trunc <16 x i16> %resulti16 to <16 x i8>
+  store <16 x i8> %result, <16 x i8>* %dest, align 16
+  ret void
+}
+
+define void @testLowerToURHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
+; CHECK-LABEL: testLowerToURHADD8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd.8h v0, v0, v1
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
+  %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
+  %add1 = add <8 x i32> %zextsrc1, %zextsrc2
+  %add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %result = trunc <8 x i32> %resulti16 to <8 x i16>
+  store <8 x i16> %result, <8 x i16>* %dest, align 16
+  ret void
+}
+
+define void @testLowerToURHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
+; CHECK-LABEL: testLowerToURHADD4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd.4s v0, v0, v1
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
+  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
+  %add1 = add <4 x i64> %zextsrc1, %zextsrc2
+  %add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
+  %resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
+  %result = trunc <4 x i64> %resulti16 to <4 x i32>
+  store <4 x i32> %result, <4 x i32>* %dest, align 16
+  ret void
+}
+
 declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
 declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
 declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone