diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -164,6 +164,9 @@
   SADDV,
   UADDV,
 
+  // Vector unsigned rounding halving addition
+  URHADD,
+
   // Vector across-lanes min/max
   // Only the lower result lane is defined.
   SMINV,
@@ -838,6 +841,7 @@
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -838,6 +838,8 @@
     setOperationAction(ISD::UADDSAT, VT, Legal);
     setOperationAction(ISD::SSUBSAT, VT, Legal);
     setOperationAction(ISD::USUBSAT, VT, Legal);
+
+    setOperationAction(ISD::TRUNCATE, VT, Custom);
   }
 
   for (MVT VT : { MVT::v4f16, MVT::v2f32, MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
@@ -1429,6 +1431,7 @@
   case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
   case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
   case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
+  case AArch64ISD::URHADD: return "AArch64ISD::URHADD";
   case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
   case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
   case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
@@ -3262,6 +3265,11 @@
     return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3));
   }
+
+  case Intrinsic::aarch64_neon_urhadd: {
+    return DAG.getNode(AArch64ISD::URHADD, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
   }
 }
@@ -3524,6 +3532,8 @@
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::VSCALE:
     return LowerVSCALE(Op, DAG);
+  case ISD::TRUNCATE:
+    return LowerTRUNCATE(Op, DAG);
   case ISD::LOAD:
     if (useSVEForFixedLengthVectorVT(Op.getValueType()))
       return LowerFixedLengthVectorLoadToSVE(Op, DAG);
@@ -8788,6 +8798,74 @@
   return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
 }
 
+// Attempt to form urhadd(OpA, OpB) from
+// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)).
+// The original form of this expression is
+// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function
+// is called the srl will have been lowered to AArch64ISD::VLSHR and the
+// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)).
+SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (!VT.isVector())
+    return Op;
+
+  SDValue Shift = Op.getOperand(0);
+  if (Shift.getOpcode() != AArch64ISD::VLSHR)
+    return Op;
+
+  // Is the right shift using an immediate value of 1?
+  ConstantSDNode *ShiftAmount = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+  if (!ShiftAmount)
+    return Op;
+  if (ShiftAmount->getZExtValue() != 1)
+    return Op;
+
+  SDValue Sub = Shift->getOperand(0);
+  if (Sub.getOpcode() != ISD::SUB)
+    return Op;
+
+  SDValue Xor = Sub.getOperand(1);
+  if (Xor.getOpcode() != ISD::XOR)
+    return Op;
+
+  SDValue ZextOpA = Xor.getOperand(0);
+  SDValue ZextOpB = Sub.getOperand(0);
+  if (!(ZextOpA.getOpcode() == ISD::ZERO_EXTEND &&
+        ZextOpB.getOpcode() == ISD::ZERO_EXTEND))
+    return Op;
+
+  // Is the result of the right shift being truncated to the same value type as
+  // the original operands, OpA and OpB?
+  SDValue OpA = ZextOpA.getOperand(0);
+  SDValue OpB = ZextOpB.getOperand(0);
+  EVT OpAVT = OpA.getValueType();
+  assert(ZextOpA.getValueType() == ZextOpB.getValueType());
+  if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
+    return Op;
+
+  // Is the XOR using a constant amount of all ones in the right hand side?
+  uint64_t C;
+  if (!isAllConstantBuildVector(Xor.getOperand(1), C))
+    return Op;
+
+  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
+  APInt CAsAPInt(ElemSizeInBits, C);
+  if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
+    return Op;
+
+  SDLoc DL(Op);
+
+  SDValue ResultURHADD = DAG.getNode(AArch64ISD::URHADD, DL, VT, OpA, OpB);
+
+  LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
+  LLVM_DEBUG(Op->dump(&DAG));
+  LLVM_DEBUG(dbgs() << "into: \n");
+  LLVM_DEBUG(ResultURHADD->dump(&DAG));
+
+  return ResultURHADD;
+}
+
 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
                                                       SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
@@ -11032,6 +11110,42 @@
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  // Optimise concat_vectors of two urhadds that use extracted subvectors from
+  // the same original vectors. Combine these into a single urhadd that
+  // operates on the two original vectors. Example:
+  //  (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
+  //                                        extract_subvector (v16i8 OpB, <0>))),
+  //                         (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
+  //                                        extract_subvector (v16i8 OpB, <8>)))))
+  //  ->
+  //  (v16i8 (urhadd (v16i8 OpA, v16i8 OpB)))
+  if (N->getNumOperands() == 2 &&
+      N0->getOpcode() == AArch64ISD::URHADD &&
+      N1->getOpcode() == AArch64ISD::URHADD) {
+    SDValue N00 = N0->getOperand(0);
+    SDValue N01 = N0->getOperand(1);
+    SDValue N10 = N1->getOperand(0);
+    SDValue N11 = N1->getOperand(1);
+
+    if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        N11->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+      SDValue N00Source = N00->getOperand(0);
+      SDValue N01Source = N01->getOperand(0);
+      SDValue N10Source = N10->getOperand(0);
+      SDValue N11Source = N11->getOperand(0);
+
+      if (N00Source == N10Source && N01Source == N11Source &&
+          N00Source->getValueType(0) == VT &&
+          N01Source->getValueType(0) == VT) {
+        assert(N0->getValueType(0) == N1->getValueType(0));
+        return DAG.getNode(AArch64ISD::URHADD, dl, VT,
+                           N00->getOperand(0), N01->getOperand(0));
+      }
+    }
+  }
+
   // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
   // splat. The indexed instructions are going to be expecting a DUPLANE64, so
   // canonicalise to that.
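For reviewers, the rewrite that LowerTRUNCATE relies on is easy to sanity-check outside the compiler: in the zero-extended type, ~x equals -x - 1, so zext(OpB) - ~zext(OpA) wraps to exactly zext(OpA) + zext(OpB) + 1, and the logical shift plus truncate therefore still computes the rounding halving add. Below is a small standalone C++ sketch of that identity for the i8/i16 case; it is not part of the patch, and the helper names are made up purely for illustration.

    // Sanity check of the (OpB - ~OpA) >> 1 form matched by LowerTRUNCATE,
    // assuming i8 elements widened to i16. Helper names are illustrative only.
    #include <cassert>
    #include <cstdint>

    // Reference semantics of urhadd on one lane: (a + b + 1) >> 1, computed
    // without overflow by widening to 16 bits first.
    static uint8_t RoundingHalvingAddRef(uint8_t A, uint8_t B) {
      return static_cast<uint8_t>((uint16_t(A) + uint16_t(B) + 1) >> 1);
    }

    // The shape LowerTRUNCATE sees after the generic DAG combines: the +1 has
    // been folded away as zext(B) - ~zext(A), with an all-ones xor mask in the
    // widened type.
    static uint8_t RoundingHalvingAddAsMatched(uint8_t A, uint8_t B) {
      uint16_t NotA = static_cast<uint16_t>(~uint16_t(A));
      // Wraps modulo 2^16 to exactly A + B + 1.
      uint16_t Diff = static_cast<uint16_t>(uint16_t(B) - NotA);
      return static_cast<uint8_t>(Diff >> 1);
    }

    int main() {
      for (unsigned A = 0; A < 256; ++A)
        for (unsigned B = 0; B < 256; ++B)
          assert(RoundingHalvingAddRef(A, B) == RoundingHalvingAddAsMatched(A, B));
      return 0;
    }

The same per-lane reasoning carries over to the v16i8/v16i16 case exercised by the new tests, and to the i16/i32 and i32/i64 element pairs.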
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -552,6 +552,8 @@
 def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
 def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
 
+def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>;
+
 def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
 def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
@@ -4088,7 +4090,7 @@
 defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
 defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
 defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
-defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
+defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>;
 defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
 defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
 defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -1,8 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
 
 define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: shadd8b:
-;CHECK: shadd.8b
+; CHECK-LABEL: shadd8b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: shadd.8b v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -10,8 +15,12 @@
 }
 
 define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: shadd16b:
-;CHECK: shadd.16b
+; CHECK-LABEL: shadd16b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: shadd.16b v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, <16 x i8>* %A
 %tmp2 = load <16 x i8>, <16 x i8>* %B
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -19,8 +28,12 @@
 }
 
 define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: shadd4h:
-;CHECK: shadd.4h
+; CHECK-LABEL: shadd4h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: shadd.4h v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -28,8 +41,12 @@
 }
 
 define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: shadd8h:
-;CHECK: shadd.8h
+; CHECK-LABEL: shadd8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: shadd.8h v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -37,8 +54,12 @@
 }
 
 define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: shadd2s:
-;CHECK: shadd.2s
+; CHECK-LABEL: shadd2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: shadd.2s v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -46,8 +67,12 @@
 }
 
 define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: shadd4s:
-;CHECK: shadd.4s
+; CHECK-LABEL: shadd4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: shadd.4s v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -55,8 +80,12 @@
 }
 
 define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: uhadd8b:
-;CHECK: uhadd.8b
+; CHECK-LABEL: uhadd8b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: uhadd.8b v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -64,8 +93,12 @@
 }
 
 define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: uhadd16b:
-;CHECK: uhadd.16b
+; CHECK-LABEL: uhadd16b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: uhadd.16b v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, <16 x i8>* %A
 %tmp2 = load <16 x i8>, <16 x i8>* %B
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -73,8 +106,12 @@
 }
 
 define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: uhadd4h:
-;CHECK: uhadd.4h
+; CHECK-LABEL: uhadd4h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: uhadd.4h v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -82,8 +119,12 @@
 }
 
 define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: uhadd8h:
-;CHECK: uhadd.8h
+; CHECK-LABEL: uhadd8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: uhadd.8h v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -91,8 +132,12 @@
 }
 
 define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: uhadd2s:
-;CHECK: uhadd.2s
+; CHECK-LABEL: uhadd2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: uhadd.2s v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -100,8 +145,12 @@
 }
 
 define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: uhadd4s:
-;CHECK: uhadd.4s
+; CHECK-LABEL: uhadd4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: uhadd.4s v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -125,8 +174,12 @@
 declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: srhadd8b:
-;CHECK: srhadd.8b
+; CHECK-LABEL: srhadd8b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: srhadd.8b v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -134,8 +187,12 @@
 }
 
 define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: srhadd16b:
-;CHECK: srhadd.16b
+; CHECK-LABEL: srhadd16b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: srhadd.16b v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, <16 x i8>* %A
 %tmp2 = load <16 x i8>, <16 x i8>* %B
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -143,8 +200,12 @@
 }
 
 define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: srhadd4h:
-;CHECK: srhadd.4h
+; CHECK-LABEL: srhadd4h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: srhadd.4h v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -152,8 +213,12 @@
 }
 
 define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: srhadd8h:
-;CHECK: srhadd.8h
+; CHECK-LABEL: srhadd8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: srhadd.8h v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -161,8 +226,12 @@
 }
 
 define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: srhadd2s:
-;CHECK: srhadd.2s
+; CHECK-LABEL: srhadd2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: srhadd.2s v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -170,8 +239,12 @@
 }
 
 define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: srhadd4s:
-;CHECK: srhadd.4s
+; CHECK-LABEL: srhadd4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: srhadd.4s v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -179,8 +252,12 @@
 }
 
 define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: urhadd8b:
-;CHECK: urhadd.8b
+; CHECK-LABEL: urhadd8b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: urhadd.8b v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -188,8 +265,12 @@
 }
 
 define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: urhadd16b:
-;CHECK: urhadd.16b
+; CHECK-LABEL: urhadd16b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: urhadd.16b v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, <16 x i8>* %A
 %tmp2 = load <16 x i8>, <16 x i8>* %B
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -197,8 +278,12 @@
 }
 
 define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: urhadd4h:
-;CHECK: urhadd.4h
+; CHECK-LABEL: urhadd4h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: urhadd.4h v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -206,8 +291,12 @@
 }
 
 define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: urhadd8h:
-;CHECK: urhadd.8h
+; CHECK-LABEL: urhadd8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: urhadd.8h v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -215,8 +304,12 @@
 }
 
 define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: urhadd2s:
-;CHECK: urhadd.2s
+; CHECK-LABEL: urhadd2s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: urhadd.2s v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -224,14 +317,66 @@
 }
 
 define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: urhadd4s:
-;CHECK: urhadd.4s
+; CHECK-LABEL: urhadd4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: urhadd.4s v0, v0, v1
+; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
 ret <4 x i32> %tmp3
 }
 
+define void @testLowerToURHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLowerToURHADD16b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: urhadd.16b v0, v0, v1
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+  %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
+  %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
+  %add1 = add <16 x i16> %zextsrc1, %zextsrc2
+  %add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %result = trunc <16 x i16> %resulti16 to <16 x i8>
+  store <16 x i8> %result, <16 x i8>* %dest, align 16
+  ret void
+}
+
+define void @testLowerToURHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
+; CHECK-LABEL: testLowerToURHADD8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: urhadd.8h v0, v0, v1
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
+  %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
+  %add1 = add <8 x i32> %zextsrc1, %zextsrc2
+  %add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %result = trunc <8 x i32> %resulti16 to <8 x i16>
+  store <8 x i16> %result, <8 x i16>* %dest, align 16
+  ret void
+}
+
+define void @testLowerToURHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
+; CHECK-LABEL: testLowerToURHADD4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: urhadd.4s v0, v0, v1
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
+  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
+  %add1 = add <4 x i64> %zextsrc1, %zextsrc2
+  %add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
+  %resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
+  %result = trunc <4 x i64> %resulti16 to <4 x i32>
+  store <4 x i32> %result, <4 x i32>* %dest, align 16
+  ret void
+}
+
 declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
 declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
 declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
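With the .td pattern now keyed on the AArch64urhadd node and the intrinsic routed through the same node in LowerINTRINSIC_WO_CHAIN, both the ACLE intrinsic and the open-coded widen/add-one/shift/truncate idiom should end up selecting urhadd. The C++ sketch below is not part of the patch; the function names are illustrative, and whether the scalar loop actually vectorizes into the exact zext/add/lshr/trunc IR shape used by the new tests depends on the vectorizer.

    // Built with something like: clang++ -O2 --target=aarch64-linux-gnu -S
    #include <arm_neon.h>
    #include <cstddef>
    #include <cstdint>

    // Uses the ACLE intrinsic; vrhaddq_u8 lowers to llvm.aarch64.neon.urhadd,
    // which this patch now maps onto AArch64ISD::URHADD.
    uint8x16_t RoundingAvgIntrinsic(uint8x16_t A, uint8x16_t B) {
      return vrhaddq_u8(A, B);
    }

    // Open-coded form: widen, add with a +1 rounding bias, halve, narrow.
    // If vectorized to the pattern exercised by testLowerToURHADD16b, the new
    // LowerTRUNCATE hook is intended to turn it into urhadd as well.
    void RoundingAvgScalar(const uint8_t *A, const uint8_t *B, uint8_t *Out,
                           size_t N) {
      for (size_t I = 0; I < N; ++I)
        Out[I] = static_cast<uint8_t>((uint16_t(A[I]) + uint16_t(B[I]) + 1) >> 1);
    }

Inspecting the generated assembly of both functions is a quick way to confirm the intrinsic path still selects urhadd.16b after the pattern was re-keyed away from int_aarch64_neon_urhadd.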