Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -232,6 +232,8 @@
   SADDV,
   UADDV,
 
+  // Add Pairwise of two vectors
+  ADDP,
   // Add Long Pairwise
   SADDLP,
   UADDLP,
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1067,6 +1067,10 @@
     setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
+
+    // ADDP custom lowering
+    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+      setOperationAction(ISD::ADD, VT, Custom);
   }
 
   if (Subtarget->hasSVE()) {
@@ -2233,6 +2237,7 @@
     MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::INDEX_VECTOR)
+    MAKE_CASE(AArch64ISD::ADDP)
     MAKE_CASE(AArch64ISD::SADDLP)
     MAKE_CASE(AArch64ISD::UADDLP)
     MAKE_CASE(AArch64ISD::CALL_RVMARKER)
@@ -19387,6 +19392,62 @@
   Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
 }
 
+/// Rewrite a 256-bit vector ADD of the form add(X, shuffle(X, undef, M)),
+/// where M = <1,0,3,2,5,4,...> swaps the lanes of every adjacent pair, so each
+/// result lane holds the sum of one pair of lanes of X. The replacement is one
+/// AArch64ISD::ADDP of the two halves of X followed by a shuffle that copies
+/// each pairwise sum into both lanes of its pair. Either operand of the ADD may
+/// be the shuffle. Nothing is added to Results if the pattern does not match.
+static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                               SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if (!VT.is256BitVector())
+    return;
+
+  // True if Op is shuffle(Src, undef, <1,0,3,2,5,4,...>).
+  auto IsPairSwapOf = [](SDValue Op, SDValue Src) {
+    auto *Shuf = dyn_cast<ShuffleVectorSDNode>(Op);
+    if (!Shuf || Shuf->getOperand(0) != Src ||
+        !Shuf->getOperand(1)->isUndef())
+      return false;
+    // I ^ 1 maps every even I to I + 1 and every odd I to I - 1.
+    ArrayRef<int> Mask = Shuf->getMask();
+    for (int I = 0, E = Mask.size(); I < E; ++I)
+      if (Mask[I] != (I ^ 1))
+        return false;
+    return true;
+  };
+
+  // Test each operand against the other one, so the match is still found when
+  // both operands are shuffles but only one is the pair-swap of the other.
+  SDValue X;
+  if (IsPairSwapOf(N->getOperand(1), N->getOperand(0)))
+    X = N->getOperand(0);
+  else if (IsPairSwapOf(N->getOperand(0), N->getOperand(1)))
+    X = N->getOperand(1);
+  else
+    return;
+
+  SDLoc DL(N);
+  auto LoHi = DAG.SplitVector(X, DL);
+  EVT HalfVT = LoHi.first.getValueType();
+  assert(HalfVT == LoHi.second.getValueType());
+  SDValue Addp =
+      DAG.getNode(AArch64ISD::ADDP, DL, HalfVT, LoHi.first, LoHi.second);
+
+  // Shuffle the elements back into order: result lanes 2*I and 2*I+1 both take
+  // the I-th pairwise sum.
+  SmallVector<int> NMask;
+  for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
+    NMask.push_back(I);
+    NMask.push_back(I);
+  }
+  Results.push_back(DAG.getVectorShuffle(
+      VT, DL,
+      DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp, DAG.getUNDEF(HalfVT)),
+      DAG.getUNDEF(VT), NMask));
+}
+
 static void ReplaceReductionResults(SDNode *N,
                                     SmallVectorImpl<SDValue> &Results,
                                     SelectionDAG &DAG, unsigned InterOp,
@@ -19564,6 +19625,9 @@
   case ISD::VECREDUCE_UMIN:
     Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
     return;
+  case ISD::ADD:
+    ReplaceAddWithADDP(N, Results, DAG);
+    return;
 
   case ISD::CTPOP:
     if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG))
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -642,8 +642,12 @@
                                [(abds node:$lhs, node:$rhs),
                                 (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>;
 
+def AArch64addp_n : SDNode<"AArch64ISD::ADDP", SDT_AArch64Zip>;
 def AArch64uaddlp_n : SDNode<"AArch64ISD::UADDLP", SDT_AArch64uaddlp>;
 def AArch64saddlp_n : SDNode<"AArch64ISD::SADDLP", SDT_AArch64uaddlp>;
+def AArch64addp : PatFrags<(ops node:$Rn, node:$Rm),
+                            [(AArch64addp_n node:$Rn, node:$Rm),
+                             (int_aarch64_neon_addp node:$Rn, node:$Rm)]>;
 def AArch64uaddlp : PatFrags<(ops node:$src),
                              [(AArch64uaddlp_n node:$src),
                               (int_aarch64_neon_uaddlp node:$src)]>;
@@ -4454,7 +4458,7 @@
 //===----------------------------------------------------------------------===//
 
 defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
-defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>;
+defm 
ADDP : SIMDThreeSameVector<0, 0b10111, "addp", AArch64addp>; defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>; defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>; defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; Index: llvm/test/CodeGen/AArch64/arm64-addp.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-addp.ll +++ llvm/test/CodeGen/AArch64/arm64-addp.ll @@ -52,10 +52,9 @@ define <4 x i64> @addp_v4i64(<4 x i64> %a) { ; CHECK-LABEL: addp_v4i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext.16b v2, v1, v1, #8 -; CHECK-NEXT: ext.16b v3, v0, v0, #8 -; CHECK-NEXT: add.2d v0, v3, v0 -; CHECK-NEXT: add.2d v1, v2, v1 +; CHECK-NEXT: addp.2d v1, v0, v1 +; CHECK-NEXT: dup.2d v0, v1[0] +; CHECK-NEXT: dup.2d v1, v1[1] ; CHECK-NEXT: ret entry: %s = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> @@ -78,10 +77,9 @@ define <8 x i32> @addp_v8i32(<8 x i32> %a) { ; CHECK-LABEL: addp_v8i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev64.4s v2, v1 -; CHECK-NEXT: rev64.4s v3, v0 -; CHECK-NEXT: add.4s v0, v3, v0 -; CHECK-NEXT: add.4s v1, v2, v1 +; CHECK-NEXT: addp.4s v1, v0, v1 +; CHECK-NEXT: zip1.4s v0, v1, v1 +; CHECK-NEXT: zip2.4s v1, v1, v1 ; CHECK-NEXT: ret entry: %s = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> @@ -92,14 +90,12 @@ define <16 x i32> @addp_v16i32(<16 x i32> %a) { ; CHECK-LABEL: addp_v16i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev64.4s v4, v3 -; CHECK-NEXT: rev64.4s v5, v2 -; CHECK-NEXT: rev64.4s v6, v1 -; CHECK-NEXT: rev64.4s v7, v0 -; CHECK-NEXT: add.4s v0, v7, v0 -; CHECK-NEXT: add.4s v1, v6, v1 -; CHECK-NEXT: add.4s v2, v5, v2 -; CHECK-NEXT: add.4s v3, v4, v3 +; CHECK-NEXT: addp.4s v1, v0, v1 +; CHECK-NEXT: zip1.4s v0, v1, v1 +; CHECK-NEXT: zip2.4s v1, v1, v1 +; CHECK-NEXT: addp.4s v3, v2, v3 +; CHECK-NEXT: zip1.4s v2, v3, v3 +; CHECK-NEXT: zip2.4s v3, v3, v3 ; CHECK-NEXT: ret entry: %s = shufflevector <16 x i32> %a, <16 x i32> 
poison, <16 x i32> @@ -122,10 +118,9 @@ define <16 x i16> @addp_v16i16(<16 x i16> %a) { ; CHECK-LABEL: addp_v16i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev32.8h v2, v1 -; CHECK-NEXT: rev32.8h v3, v0 -; CHECK-NEXT: add.8h v0, v3, v0 -; CHECK-NEXT: add.8h v1, v2, v1 +; CHECK-NEXT: addp.8h v1, v0, v1 +; CHECK-NEXT: zip1.8h v0, v1, v1 +; CHECK-NEXT: zip2.8h v1, v1, v1 ; CHECK-NEXT: ret entry: %s = shufflevector <16 x i16> %a, <16 x i16> poison, <16 x i32> @@ -148,10 +143,9 @@ define <32 x i8> @addp_v32i8(<32 x i8> %a) { ; CHECK-LABEL: addp_v32i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev16.16b v2, v1 -; CHECK-NEXT: rev16.16b v3, v0 -; CHECK-NEXT: add.16b v0, v3, v0 -; CHECK-NEXT: add.16b v1, v2, v1 +; CHECK-NEXT: addp.16b v1, v0, v1 +; CHECK-NEXT: zip1.16b v0, v1, v1 +; CHECK-NEXT: zip2.16b v1, v1, v1 ; CHECK-NEXT: ret entry: %s = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> Index: llvm/test/CodeGen/AArch64/insert-extend.ll =================================================================== --- llvm/test/CodeGen/AArch64/insert-extend.ll +++ llvm/test/CodeGen/AArch64/insert-extend.ll @@ -91,111 +91,100 @@ ; CHECK-NEXT: add v1.4s, v5.4s, v1.4s ; CHECK-NEXT: add v2.4s, v4.4s, v2.4s ; CHECK-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-NEXT: rev64 v6.4s, v1.4s -; CHECK-NEXT: rev64 v7.4s, v2.4s +; CHECK-NEXT: rev64 v6.4s, v2.4s +; CHECK-NEXT: rev64 v17.4s, v1.4s ; CHECK-NEXT: add v3.4s, v3.4s, v16.4s -; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v5.4s, v3.4s -; CHECK-NEXT: add v18.4s, v1.4s, v6.4s -; CHECK-NEXT: add v19.4s, v2.4s, v7.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s -; CHECK-NEXT: add v16.4s, v0.4s, v4.4s -; CHECK-NEXT: zip1 v7.4s, v2.4s, v1.4s -; CHECK-NEXT: add v17.4s, v3.4s, v5.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s -; CHECK-NEXT: uzp2 v6.4s, v17.4s, v16.4s -; CHECK-NEXT: zip2 v5.4s, v0.4s, v3.4s -; CHECK-NEXT: ext v20.16b, v17.16b, v17.16b, #12 -; 
CHECK-NEXT: mov v0.s[1], v3.s[0] -; CHECK-NEXT: ext v3.16b, v2.16b, v7.16b, #8 -; CHECK-NEXT: mov v2.s[3], v1.s[2] -; CHECK-NEXT: zip1 v4.4s, v19.4s, v18.4s -; CHECK-NEXT: trn2 v21.4s, v17.4s, v16.4s -; CHECK-NEXT: uzp2 v6.4s, v6.4s, v17.4s -; CHECK-NEXT: mov v17.s[0], v16.s[1] -; CHECK-NEXT: zip2 v7.4s, v19.4s, v18.4s -; CHECK-NEXT: mov v0.d[1], v3.d[1] -; CHECK-NEXT: ext v1.16b, v16.16b, v20.16b, #12 -; CHECK-NEXT: mov v5.d[1], v2.d[1] -; CHECK-NEXT: mov v17.d[1], v4.d[1] -; CHECK-NEXT: mov v6.d[1], v7.d[1] -; CHECK-NEXT: mov v1.d[1], v7.d[1] -; CHECK-NEXT: add v3.4s, v5.4s, v0.4s -; CHECK-NEXT: mov v21.d[1], v4.d[1] +; CHECK-NEXT: rev64 v5.4s, v0.4s ; CHECK-NEXT: rev64 v4.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v17.4s, v1.4s +; CHECK-NEXT: addp v16.4s, v2.4s, v1.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v17.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s +; CHECK-NEXT: addp v7.4s, v3.4s, v0.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s -; CHECK-NEXT: add v2.4s, v6.4s, v21.4s -; CHECK-NEXT: rev64 v6.4s, v1.4s -; CHECK-NEXT: add v7.4s, v3.4s, v4.4s +; CHECK-NEXT: zip1 v5.4s, v2.4s, v1.4s ; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s -; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: add v17.4s, v1.4s, v6.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s -; CHECK-NEXT: add v19.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s -; CHECK-NEXT: ext v16.16b, v7.16b, v3.16b, #4 -; CHECK-NEXT: add v18.4s, v2.4s, v5.4s -; CHECK-NEXT: ext v6.16b, v17.16b, v1.16b, #4 -; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s -; CHECK-NEXT: ext v5.16b, v19.16b, v0.16b, #4 -; CHECK-NEXT: rev64 v16.4s, v16.4s -; CHECK-NEXT: rev64 v6.4s, v6.4s -; CHECK-NEXT: ext v20.16b, v18.16b, v18.16b, #4 -; CHECK-NEXT: rev64 v5.4s, v5.4s -; CHECK-NEXT: mov v7.s[3], v3.s[3] -; CHECK-NEXT: ext v4.16b, v3.16b, v16.16b, #12 -; CHECK-NEXT: mov v19.s[3], v0.s[3] -; CHECK-NEXT: mov v17.s[3], v1.s[3] -; CHECK-NEXT: ext v6.16b, v1.16b, v6.16b, #12 +; CHECK-NEXT: ext v18.16b, v7.16b, v7.16b, #8 +; CHECK-NEXT: zip2 
v4.4s, v0.4s, v3.4s +; CHECK-NEXT: mov v0.s[1], v3.s[0] +; CHECK-NEXT: ext v3.16b, v2.16b, v5.16b, #8 +; CHECK-NEXT: mov v2.s[3], v1.s[2] +; CHECK-NEXT: uzp2 v19.4s, v7.4s, v16.4s +; CHECK-NEXT: uzp1 v6.4s, v7.4s, v16.4s +; CHECK-NEXT: uzp1 v7.4s, v18.4s, v16.4s +; CHECK-NEXT: uzp2 v1.4s, v18.4s, v16.4s +; CHECK-NEXT: mov v0.d[1], v3.d[1] +; CHECK-NEXT: mov v4.d[1], v2.d[1] +; CHECK-NEXT: add v5.4s, v19.4s, v6.4s +; CHECK-NEXT: sub v1.4s, v7.4s, v1.4s +; CHECK-NEXT: rev64 v2.4s, v5.4s +; CHECK-NEXT: sub v6.4s, v0.4s, v4.4s +; CHECK-NEXT: add v0.4s, v4.4s, v0.4s +; CHECK-NEXT: rev64 v3.4s, v1.4s +; CHECK-NEXT: rev64 v4.4s, v6.4s +; CHECK-NEXT: rev64 v7.4s, v0.4s +; CHECK-NEXT: addp v16.4s, v1.4s, v6.4s +; CHECK-NEXT: addp v17.4s, v5.4s, v0.4s +; CHECK-NEXT: sub v4.4s, v6.4s, v4.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s +; CHECK-NEXT: sub v2.4s, v5.4s, v2.4s +; CHECK-NEXT: ext v3.16b, v1.16b, v16.16b, #8 +; CHECK-NEXT: ext v5.16b, v17.16b, v0.16b, #4 +; CHECK-NEXT: ext v6.16b, v16.16b, v4.16b, #4 +; CHECK-NEXT: zip1 v7.4s, v17.4s, v17.4s +; CHECK-NEXT: ext v18.16b, v3.16b, v1.16b, #4 +; CHECK-NEXT: zip2 v5.4s, v5.4s, v17.4s +; CHECK-NEXT: zip2 v6.4s, v6.4s, v16.4s +; CHECK-NEXT: trn2 v7.4s, v7.4s, v2.4s +; CHECK-NEXT: ext v2.16b, v2.16b, v17.16b, #4 +; CHECK-NEXT: mov v1.s[2], v16.s[1] ; CHECK-NEXT: ext v5.16b, v0.16b, v5.16b, #12 -; CHECK-NEXT: rev64 v18.4s, v18.4s -; CHECK-NEXT: trn2 v20.4s, v2.4s, v20.4s -; CHECK-NEXT: sub v16.4s, v7.4s, v4.4s -; CHECK-NEXT: sub v21.4s, v17.4s, v6.4s -; CHECK-NEXT: sub v22.4s, v19.4s, v5.4s -; CHECK-NEXT: trn2 v2.4s, v18.4s, v2.4s -; CHECK-NEXT: mov v17.s[0], v1.s[0] -; CHECK-NEXT: ext v1.16b, v20.16b, v20.16b, #4 -; CHECK-NEXT: mov v19.s[0], v0.s[0] -; CHECK-NEXT: mov v7.s[0], v3.s[0] -; CHECK-NEXT: add v0.4s, v17.4s, v6.4s -; CHECK-NEXT: add v3.4s, v2.4s, v1.4s -; CHECK-NEXT: add v5.4s, v19.4s, v5.4s -; CHECK-NEXT: add v4.4s, v7.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v2.4s, v1.4s -; 
CHECK-NEXT: mov v4.d[1], v16.d[1] -; CHECK-NEXT: mov v5.d[1], v22.d[1] -; CHECK-NEXT: mov v0.d[1], v21.d[1] -; CHECK-NEXT: mov v3.d[1], v1.d[1] -; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: ext v6.16b, v4.16b, v6.16b, #12 +; CHECK-NEXT: uzp2 v3.4s, v3.4s, v18.4s +; CHECK-NEXT: mov v4.s[2], v16.s[3] +; CHECK-NEXT: mov v0.s[2], v17.s[3] +; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: sub v18.4s, v1.4s, v3.4s +; CHECK-NEXT: sub v19.4s, v4.4s, v6.4s +; CHECK-NEXT: sub v20.4s, v0.4s, v5.4s +; CHECK-NEXT: sub v21.4s, v7.4s, v2.4s +; CHECK-NEXT: mov v4.s[1], v16.s[2] +; CHECK-NEXT: mov v0.s[1], v17.s[2] +; CHECK-NEXT: mov v2.s[0], v17.s[1] +; CHECK-NEXT: mov v1.s[1], v16.s[0] +; CHECK-NEXT: add v4.4s, v4.4s, v6.4s +; CHECK-NEXT: add v0.4s, v0.4s, v5.4s +; CHECK-NEXT: add v2.4s, v7.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mov v2.d[1], v21.d[1] +; CHECK-NEXT: mov v1.d[1], v18.d[1] +; CHECK-NEXT: mov v4.d[1], v19.d[1] +; CHECK-NEXT: mov v0.d[1], v20.d[1] +; CHECK-NEXT: movi v3.8h, #1 ; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v2.4s, v0.4s, #15 -; CHECK-NEXT: ushr v6.4s, v4.4s, #15 -; CHECK-NEXT: ushr v7.4s, v3.4s, #15 -; CHECK-NEXT: ushr v16.4s, v5.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v1.16b -; CHECK-NEXT: and v16.16b, v16.16b, v1.16b -; CHECK-NEXT: and v7.16b, v7.16b, v1.16b -; CHECK-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-NEXT: mul v2.4s, v6.4s, v17.4s +; CHECK-NEXT: ushr v5.4s, v4.4s, #15 +; CHECK-NEXT: ushr v6.4s, v2.4s, #15 +; CHECK-NEXT: ushr v7.4s, v0.4s, #15 +; CHECK-NEXT: ushr v16.4s, v1.4s, #15 +; CHECK-NEXT: and v6.16b, v6.16b, v3.16b +; CHECK-NEXT: and v16.16b, v16.16b, v3.16b +; CHECK-NEXT: and v7.16b, v7.16b, v3.16b +; CHECK-NEXT: and v3.16b, v5.16b, v3.16b +; CHECK-NEXT: mul v5.4s, v6.4s, v17.4s ; CHECK-NEXT: mul v6.4s, v16.4s, v17.4s -; CHECK-NEXT: mul v1.4s, v1.4s, v17.4s +; CHECK-NEXT: mul v3.4s, v3.4s, v17.4s ; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s -; CHECK-NEXT: add v4.4s, v2.4s, v4.4s -; 
CHECK-NEXT: add v5.4s, v6.4s, v5.4s -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: add v3.4s, v7.4s, v3.4s -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: eor v1.16b, v3.16b, v7.16b -; CHECK-NEXT: eor v3.16b, v5.16b, v6.16b -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s +; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: add v1.4s, v6.4s, v1.4s +; CHECK-NEXT: add v4.4s, v3.4s, v4.4s +; CHECK-NEXT: add v0.4s, v7.4s, v0.4s +; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v7.16b +; CHECK-NEXT: eor v1.16b, v1.16b, v6.16b +; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: add v0.4s, v0.4s, v3.4s ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 Index: llvm/test/CodeGen/AArch64/reduce-shuffle.ll =================================================================== --- llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -35,122 +35,120 @@ ; CHECK-NEXT: ushll v4.8h, v4.8b, #0 ; CHECK-NEXT: ld1 { v6.s }[1], [x2] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: usubl v7.4s, v3.4h, v5.4h -; CHECK-NEXT: usubl2 v3.4s, v3.8h, v5.8h -; CHECK-NEXT: usubl2 v5.4s, v2.8h, v4.8h +; CHECK-NEXT: usubl v17.4s, v3.4h, v5.4h +; CHECK-NEXT: usubl2 v5.4s, v3.8h, v5.8h +; CHECK-NEXT: usubl2 v3.4s, v2.8h, v4.8h ; CHECK-NEXT: usubl v2.4s, v2.4h, v4.4h ; CHECK-NEXT: ushll v4.8h, v6.8b, #0 -; CHECK-NEXT: shl v5.4s, v5.4s, #16 +; CHECK-NEXT: shl v3.4s, v3.4s, #16 ; CHECK-NEXT: usubl2 v6.4s, v0.8h, v4.8h ; CHECK-NEXT: shl v2.4s, v2.4s, #16 ; CHECK-NEXT: usubl v0.4s, v0.4h, v4.4h -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-NEXT: shl v4.4s, v6.4s, #16 -; CHECK-NEXT: shl v0.4s, v0.4s, #16 -; CHECK-NEXT: add v2.4s, v2.4s, v16.4s -; CHECK-NEXT: add v3.4s, v4.4s, v3.4s -; CHECK-NEXT: add v0.4s, v0.4s, v7.4s -; CHECK-NEXT: 
uzp2 v6.4s, v2.4s, v1.4s -; CHECK-NEXT: ext v17.16b, v2.16b, v2.16b, #12 -; CHECK-NEXT: zip1 v4.4s, v0.4s, v3.4s -; CHECK-NEXT: mov v16.16b, v2.16b -; CHECK-NEXT: mov v19.16b, v1.16b -; CHECK-NEXT: zip2 v5.4s, v1.4s, v2.4s -; CHECK-NEXT: zip2 v18.4s, v2.4s, v1.4s -; CHECK-NEXT: mov v16.s[0], v1.s[1] -; CHECK-NEXT: uzp2 v6.4s, v6.4s, v2.4s -; CHECK-NEXT: zip2 v7.4s, v0.4s, v3.4s -; CHECK-NEXT: ext v1.16b, v1.16b, v17.16b, #12 -; CHECK-NEXT: ext v17.16b, v0.16b, v4.16b, #8 -; CHECK-NEXT: mov v19.s[1], v2.s[0] -; CHECK-NEXT: mov v0.s[3], v3.s[2] -; CHECK-NEXT: mov v6.d[1], v7.d[1] -; CHECK-NEXT: mov v16.d[1], v4.d[1] -; CHECK-NEXT: mov v19.d[1], v17.d[1] -; CHECK-NEXT: mov v18.d[1], v0.d[1] -; CHECK-NEXT: mov v1.d[1], v7.d[1] -; CHECK-NEXT: mov v5.d[1], v0.d[1] -; CHECK-NEXT: add v0.4s, v16.4s, v19.4s -; CHECK-NEXT: add v4.4s, v6.4s, v18.4s -; CHECK-NEXT: rev64 v3.4s, v0.4s -; CHECK-NEXT: sub v1.4s, v5.4s, v1.4s -; CHECK-NEXT: rev64 v5.4s, v4.4s -; CHECK-NEXT: sub v2.4s, v19.4s, v16.4s -; CHECK-NEXT: mov v3.d[1], v0.d[1] -; CHECK-NEXT: add v6.4s, v1.4s, v2.4s -; CHECK-NEXT: sub v1.4s, v2.4s, v1.4s -; CHECK-NEXT: mov v5.d[1], v4.d[1] -; CHECK-NEXT: rev64 v2.4s, v1.4s -; CHECK-NEXT: rev64 v7.4s, v6.4s +; CHECK-NEXT: add v19.4s, v3.4s, v1.4s +; CHECK-NEXT: shl v6.4s, v6.4s, #16 +; CHECK-NEXT: shl v4.4s, v0.4s, #16 +; CHECK-NEXT: add v1.4s, v2.4s, v16.4s +; CHECK-NEXT: add v7.4s, v6.4s, v5.4s +; CHECK-NEXT: add v18.4s, v4.4s, v17.4s +; CHECK-NEXT: ext v20.16b, v1.16b, v1.16b, #12 +; CHECK-NEXT: zip1 v17.4s, v18.4s, v7.4s +; CHECK-NEXT: uzp2 v16.4s, v1.4s, v19.4s +; CHECK-NEXT: mov v2.16b, v1.16b +; CHECK-NEXT: mov v6.16b, v19.16b +; CHECK-NEXT: mov v2.s[0], v19.s[1] +; CHECK-NEXT: ext v5.16b, v19.16b, v20.16b, #12 +; CHECK-NEXT: ext v20.16b, v18.16b, v17.16b, #8 +; CHECK-NEXT: mov v6.s[1], v1.s[0] +; CHECK-NEXT: zip2 v3.4s, v1.4s, v19.4s +; CHECK-NEXT: uzp2 v4.4s, v16.4s, v1.4s +; CHECK-NEXT: zip2 v16.4s, v18.4s, v7.4s +; CHECK-NEXT: mov v18.s[3], v7.s[2] +; CHECK-NEXT: 
mov v2.d[1], v17.d[1] +; CHECK-NEXT: mov v6.d[1], v20.d[1] +; CHECK-NEXT: zip2 v0.4s, v19.4s, v1.4s +; CHECK-NEXT: mov v4.d[1], v16.d[1] +; CHECK-NEXT: mov v3.d[1], v18.d[1] +; CHECK-NEXT: add v1.4s, v2.4s, v6.4s +; CHECK-NEXT: mov v5.d[1], v16.d[1] +; CHECK-NEXT: mov v0.d[1], v18.d[1] ; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: sub v2.4s, v6.4s, v2.4s +; CHECK-NEXT: rev64 v6.4s, v1.4s +; CHECK-NEXT: rev64 v4.4s, v3.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s -; CHECK-NEXT: add v4.4s, v1.4s, v2.4s -; CHECK-NEXT: add v16.4s, v6.4s, v7.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v2.4s -; CHECK-NEXT: sub v2.4s, v6.4s, v7.4s -; CHECK-NEXT: rev64 v6.4s, v3.4s -; CHECK-NEXT: rev64 v17.4s, v0.4s -; CHECK-NEXT: ext v7.16b, v4.16b, v1.16b, #4 -; CHECK-NEXT: ext v5.16b, v16.16b, v2.16b, #4 -; CHECK-NEXT: add v18.4s, v3.4s, v6.4s -; CHECK-NEXT: add v19.4s, v0.4s, v17.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v17.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s -; CHECK-NEXT: rev64 v6.4s, v7.4s -; CHECK-NEXT: rev64 v7.4s, v18.4s +; CHECK-NEXT: movi v19.8h, #1 +; CHECK-NEXT: mov v6.d[1], v1.d[1] +; CHECK-NEXT: add v5.4s, v0.4s, v2.4s +; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s +; CHECK-NEXT: mov v4.d[1], v3.d[1] +; CHECK-NEXT: rev64 v2.4s, v5.4s +; CHECK-NEXT: rev64 v7.4s, v0.4s +; CHECK-NEXT: add v3.4s, v3.4s, v6.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s +; CHECK-NEXT: addp v4.4s, v3.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v5.4s, v2.4s +; CHECK-NEXT: sub v6.4s, v0.4s, v7.4s +; CHECK-NEXT: rev64 v7.4s, v3.4s +; CHECK-NEXT: rev64 v5.4s, v1.4s +; CHECK-NEXT: zip1 v16.4s, v4.4s, v4.4s +; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ext v17.16b, v4.16b, v2.16b, #4 +; CHECK-NEXT: sub v3.4s, v3.4s, v7.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s +; CHECK-NEXT: ext v7.16b, v0.16b, v6.16b, #4 +; CHECK-NEXT: ext v18.16b, v3.16b, v4.16b, #4 +; CHECK-NEXT: trn2 v3.4s, v16.4s, v3.4s +; CHECK-NEXT: ext v16.16b, v1.16b, v0.16b, #8 +; CHECK-NEXT: zip2 v5.4s, v17.4s, v4.4s +; CHECK-NEXT: zip2 v7.4s, v7.4s, v0.4s 
; CHECK-NEXT: ext v17.16b, v18.16b, v18.16b, #4 -; CHECK-NEXT: ext v18.16b, v19.16b, v0.16b, #4 -; CHECK-NEXT: rev64 v5.4s, v5.4s -; CHECK-NEXT: mov v16.s[3], v2.s[3] -; CHECK-NEXT: mov v4.s[3], v1.s[3] -; CHECK-NEXT: rev64 v18.4s, v18.4s -; CHECK-NEXT: mov v19.s[3], v0.s[3] +; CHECK-NEXT: ext v18.16b, v16.16b, v1.16b, #4 ; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #12 -; CHECK-NEXT: ext v6.16b, v1.16b, v6.16b, #12 -; CHECK-NEXT: trn2 v7.4s, v7.4s, v3.4s -; CHECK-NEXT: trn2 v3.4s, v3.4s, v17.4s -; CHECK-NEXT: ext v18.16b, v0.16b, v18.16b, #12 -; CHECK-NEXT: sub v17.4s, v16.4s, v5.4s -; CHECK-NEXT: sub v20.4s, v4.4s, v6.4s -; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #4 -; CHECK-NEXT: mov v16.s[0], v2.s[0] -; CHECK-NEXT: sub v2.4s, v19.4s, v18.4s -; CHECK-NEXT: mov v4.s[0], v1.s[0] -; CHECK-NEXT: mov v19.s[0], v0.s[0] -; CHECK-NEXT: add v1.4s, v7.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v7.4s, v3.4s -; CHECK-NEXT: add v3.4s, v4.4s, v6.4s -; CHECK-NEXT: add v4.4s, v16.4s, v5.4s -; CHECK-NEXT: add v5.4s, v19.4s, v18.4s -; CHECK-NEXT: mov v4.d[1], v17.d[1] +; CHECK-NEXT: mov v2.s[2], v4.s[3] +; CHECK-NEXT: ext v7.16b, v6.16b, v7.16b, #12 +; CHECK-NEXT: mov v1.s[2], v0.s[1] +; CHECK-NEXT: mov v6.s[2], v0.s[3] +; CHECK-NEXT: uzp2 v16.4s, v16.4s, v18.4s +; CHECK-NEXT: sub v18.4s, v2.4s, v5.4s +; CHECK-NEXT: mov v2.s[1], v4.s[2] +; CHECK-NEXT: sub v20.4s, v3.4s, v17.4s +; CHECK-NEXT: mov v17.s[0], v4.s[1] +; CHECK-NEXT: sub v21.4s, v6.4s, v7.4s +; CHECK-NEXT: sub v4.4s, v1.4s, v16.4s +; CHECK-NEXT: mov v6.s[1], v0.s[2] +; CHECK-NEXT: mov v1.s[1], v0.s[0] +; CHECK-NEXT: add v0.4s, v2.4s, v5.4s +; CHECK-NEXT: add v3.4s, v3.4s, v17.4s +; CHECK-NEXT: mov v0.d[1], v18.d[1] +; CHECK-NEXT: add v2.4s, v6.4s, v7.4s +; CHECK-NEXT: add v1.4s, v1.4s, v16.4s ; CHECK-NEXT: mov v3.d[1], v20.d[1] -; CHECK-NEXT: mov v1.d[1], v0.d[1] -; CHECK-NEXT: mov v5.d[1], v2.d[1] -; CHECK-NEXT: movi v0.8h, #1 -; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v2.4s, v1.4s, #15 -; CHECK-NEXT: 
ushr v6.4s, v4.4s, #15 -; CHECK-NEXT: ushr v7.4s, v5.4s, #15 -; CHECK-NEXT: ushr v16.4s, v3.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v0.16b -; CHECK-NEXT: and v16.16b, v16.16b, v0.16b -; CHECK-NEXT: and v7.16b, v7.16b, v0.16b -; CHECK-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-NEXT: mul v2.4s, v6.4s, v17.4s -; CHECK-NEXT: mul v6.4s, v16.4s, v17.4s -; CHECK-NEXT: mul v0.4s, v0.4s, v17.4s -; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s -; CHECK-NEXT: add v4.4s, v2.4s, v4.4s -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: add v5.4s, v7.4s, v5.4s -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: eor v1.16b, v5.16b, v7.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v2.d[1], v21.d[1] +; CHECK-NEXT: mov v1.d[1], v4.d[1] +; CHECK-NEXT: ushr v5.4s, v0.4s, #15 +; CHECK-NEXT: ushr v7.4s, v3.4s, #15 +; CHECK-NEXT: and v4.16b, v5.16b, v19.16b +; CHECK-NEXT: ushr v6.4s, v2.4s, #15 +; CHECK-NEXT: movi v5.2d, #0x00ffff0000ffff +; CHECK-NEXT: ushr v16.4s, v1.4s, #15 +; CHECK-NEXT: and v6.16b, v6.16b, v19.16b +; CHECK-NEXT: and v16.16b, v16.16b, v19.16b +; CHECK-NEXT: and v7.16b, v7.16b, v19.16b +; CHECK-NEXT: mul v6.4s, v6.4s, v5.4s +; CHECK-NEXT: mul v4.4s, v4.4s, v5.4s +; CHECK-NEXT: mul v7.4s, v7.4s, v5.4s +; CHECK-NEXT: mul v5.4s, v16.4s, v5.4s +; CHECK-NEXT: add v2.4s, v6.4s, v2.4s +; CHECK-NEXT: add v0.4s, v4.4s, v0.4s +; CHECK-NEXT: add v3.4s, v7.4s, v3.4s +; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b +; CHECK-NEXT: eor v1.16b, v1.16b, v5.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v4.16b +; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 @@ -541,76 +539,68 @@ ; CHECK-NEXT: add 
v3.4s, v3.4s, v16.4s ; CHECK-NEXT: add v1.4s, v5.4s, v1.4s ; CHECK-NEXT: add v2.4s, v4.4s, v2.4s -; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v5.4s, v3.4s -; CHECK-NEXT: rev64 v6.4s, v1.4s -; CHECK-NEXT: rev64 v7.4s, v2.4s -; CHECK-NEXT: add v16.4s, v0.4s, v4.4s -; CHECK-NEXT: add v17.4s, v3.4s, v5.4s -; CHECK-NEXT: add v18.4s, v1.4s, v6.4s -; CHECK-NEXT: add v19.4s, v2.4s, v7.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s -; CHECK-NEXT: ext v4.16b, v17.16b, v17.16b, #12 -; CHECK-NEXT: zip1 v5.4s, v18.4s, v19.4s -; CHECK-NEXT: ext v7.16b, v2.16b, v1.16b, #4 -; CHECK-NEXT: mov v19.s[2], v18.s[3] -; CHECK-NEXT: uzp2 v18.4s, v17.4s, v16.4s +; CHECK-NEXT: rev64 v4.4s, v3.4s +; CHECK-NEXT: rev64 v5.4s, v0.4s +; CHECK-NEXT: rev64 v6.4s, v2.4s +; CHECK-NEXT: rev64 v17.4s, v1.4s +; CHECK-NEXT: addp v7.4s, v3.4s, v0.4s +; CHECK-NEXT: addp v16.4s, v2.4s, v1.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s +; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v17.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s ; CHECK-NEXT: zip2 v6.4s, v3.4s, v0.4s -; CHECK-NEXT: ext v4.16b, v16.16b, v4.16b, #12 -; CHECK-NEXT: zip1 v0.4s, v3.4s, v0.4s -; CHECK-NEXT: trn2 v3.4s, v17.4s, v16.4s -; CHECK-NEXT: uzp2 v18.4s, v18.4s, v17.4s +; CHECK-NEXT: ext v17.16b, v2.16b, v1.16b, #4 ; CHECK-NEXT: mov v1.s[3], v2.s[2] -; CHECK-NEXT: ext v2.16b, v7.16b, v2.16b, #4 -; CHECK-NEXT: mov v17.s[0], v16.s[1] -; CHECK-NEXT: mov v4.d[1], v19.d[1] +; CHECK-NEXT: uzp2 v5.4s, v7.4s, v16.4s +; CHECK-NEXT: ext v4.16b, v16.16b, v16.16b, #8 +; CHECK-NEXT: uzp1 v16.4s, v7.4s, v16.4s +; CHECK-NEXT: zip1 v0.4s, v3.4s, v0.4s ; CHECK-NEXT: mov v6.d[1], v1.d[1] -; CHECK-NEXT: mov v0.d[1], v2.d[1] -; CHECK-NEXT: mov v17.d[1], v5.d[1] -; CHECK-NEXT: mov v3.d[1], v5.d[1] -; CHECK-NEXT: mov v18.d[1], v19.d[1] -; CHECK-NEXT: add v1.4s, v6.4s, v0.4s -; CHECK-NEXT: add v2.4s, v4.4s, v17.4s +; 
CHECK-NEXT: ext v1.16b, v17.16b, v2.16b, #4 +; CHECK-NEXT: rev64 v3.4s, v5.4s +; CHECK-NEXT: uzp1 v2.4s, v7.4s, v4.4s +; CHECK-NEXT: rev64 v5.4s, v16.4s +; CHECK-NEXT: uzp2 v4.4s, v7.4s, v4.4s +; CHECK-NEXT: mov v0.d[1], v1.d[1] +; CHECK-NEXT: add v1.4s, v3.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: add v3.4s, v6.4s, v0.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v18.4s -; CHECK-NEXT: zip1 v4.4s, v2.4s, v1.4s -; CHECK-NEXT: zip1 v5.4s, v3.4s, v0.4s -; CHECK-NEXT: uzp2 v6.4s, v2.4s, v1.4s -; CHECK-NEXT: mov v7.16b, v2.16b -; CHECK-NEXT: trn2 v4.4s, v2.4s, v4.4s -; CHECK-NEXT: ext v16.16b, v3.16b, v5.16b, #8 -; CHECK-NEXT: uzp2 v6.4s, v6.4s, v2.4s -; CHECK-NEXT: mov v7.s[1], v1.s[1] -; CHECK-NEXT: zip2 v1.4s, v2.4s, v1.4s -; CHECK-NEXT: zip2 v2.4s, v3.4s, v0.4s -; CHECK-NEXT: mov v3.s[3], v0.s[2] +; CHECK-NEXT: zip1 v4.4s, v1.4s, v3.4s +; CHECK-NEXT: uzp2 v5.4s, v1.4s, v3.4s +; CHECK-NEXT: zip1 v6.4s, v2.4s, v0.4s +; CHECK-NEXT: zip2 v7.4s, v1.4s, v3.4s +; CHECK-NEXT: trn2 v4.4s, v1.4s, v4.4s +; CHECK-NEXT: uzp2 v5.4s, v5.4s, v1.4s +; CHECK-NEXT: ext v16.16b, v2.16b, v6.16b, #8 +; CHECK-NEXT: zip2 v17.4s, v2.4s, v0.4s +; CHECK-NEXT: mov v1.s[1], v3.s[1] +; CHECK-NEXT: mov v2.s[3], v0.s[2] ; CHECK-NEXT: mov v4.d[1], v16.d[1] -; CHECK-NEXT: mov v7.d[1], v5.d[1] -; CHECK-NEXT: mov v6.d[1], v2.d[1] -; CHECK-NEXT: mov v1.d[1], v3.d[1] -; CHECK-NEXT: add v0.4s, v4.4s, v7.4s -; CHECK-NEXT: sub v3.4s, v7.4s, v4.4s -; CHECK-NEXT: add v4.4s, v6.4s, v1.4s -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: ext v7.16b, v4.16b, v4.16b, #4 -; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s -; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #8 -; CHECK-NEXT: ext v6.16b, v7.16b, v1.16b, #8 -; CHECK-NEXT: zip1 v16.4s, v4.4s, v1.4s -; CHECK-NEXT: zip2 v17.4s, v4.4s, v1.4s -; CHECK-NEXT: ext v2.16b, v5.16b, v2.16b, #4 -; CHECK-NEXT: zip2 v5.4s, v3.4s, v0.4s -; CHECK-NEXT: zip2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ext v4.16b, v6.16b, v7.16b, #4 -; 
CHECK-NEXT: zip2 v6.4s, v0.4s, v3.4s -; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-NEXT: add v2.4s, v2.4s, v4.4s -; CHECK-NEXT: sub v3.4s, v17.4s, v6.4s -; CHECK-NEXT: sub v0.4s, v16.4s, v0.4s +; CHECK-NEXT: mov v5.d[1], v17.d[1] +; CHECK-NEXT: mov v1.d[1], v6.d[1] +; CHECK-NEXT: mov v7.d[1], v2.d[1] +; CHECK-NEXT: add v0.4s, v4.4s, v1.4s +; CHECK-NEXT: add v2.4s, v5.4s, v7.4s +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: sub v5.4s, v7.4s, v5.4s +; CHECK-NEXT: ext v6.16b, v3.16b, v1.16b, #8 +; CHECK-NEXT: zip1 v7.4s, v2.4s, v5.4s +; CHECK-NEXT: ext v17.16b, v4.16b, v5.16b, #8 +; CHECK-NEXT: zip2 v16.4s, v2.4s, v5.4s +; CHECK-NEXT: ext v3.16b, v6.16b, v3.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s +; CHECK-NEXT: zip2 v2.4s, v5.4s, v2.4s +; CHECK-NEXT: ext v4.16b, v17.16b, v4.16b, #4 +; CHECK-NEXT: zip2 v5.4s, v0.4s, v1.4s +; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v1.4s, v6.4s, v2.4s +; CHECK-NEXT: add v2.4s, v3.4s, v4.4s +; CHECK-NEXT: sub v3.4s, v16.4s, v5.4s +; CHECK-NEXT: sub v0.4s, v7.4s, v0.4s ; CHECK-NEXT: movi v4.8h, #1 ; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff ; CHECK-NEXT: ushr v5.4s, v0.4s, #15