diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -15687,6 +15687,41 @@ return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond); } +// Transform fixed-length vector add/sub(ext(a), ext(b)) wider than 128 bits, +// where each ext is a zext or sext from i8 to i32 or from i8/i16 to i64, into +// sext(add/sub(ext(a), ext(b))), done at half the result's element width. +// This allows extra saddl/uaddl uses at lower vector widths and fewer extends. +static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 || + (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && + N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) || + (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && + N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) || + N->getOperand(0).getOperand(0).getValueType() != + N->getOperand(1).getOperand(0).getValueType()) + return SDValue(); + + SDValue N0 = N->getOperand(0).getOperand(0); + SDValue N1 = N->getOperand(1).getOperand(0); + EVT InVT = N0.getValueType(); + + EVT S1 = InVT.getScalarType(); + EVT S2 = VT.getScalarType(); + if ((S2 == MVT::i32 && S1 == MVT::i8) || + (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) { + SDLoc DL(N); + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), + S2.getHalfSizedIntegerVT(*DAG.getContext()), + VT.getVectorElementCount()); + SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0); + SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1); + SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp); + } + return SDValue(); +} + static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -15699,6 +15734,8 @@ return Val; if (SDValue Val = performNegCSelCombine(N, DAG)) return Val; + if
(SDValue Val = performVectorAddSubExtCombine(N, DAG)) + return Val; return performAddSubLongCombine(N, DCI, DAG); } diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll --- a/llvm/test/CodeGen/AArch64/insert-extend.ll +++ b/llvm/test/CodeGen/AArch64/insert-extend.ll @@ -46,145 +46,133 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* nocapture noundef readonly %p2, i32 noundef %st2) { ; CHECK-LABEL: large: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sxtw x8, w1 ; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 -; CHECK-NEXT: sxtw x11, w3 -; CHECK-NEXT: add x9, x0, x8 -; CHECK-NEXT: add x12, x2, x11 -; CHECK-NEXT: add x10, x9, x8 -; CHECK-NEXT: add x13, x12, x11 -; CHECK-NEXT: add x8, x10, x8 -; CHECK-NEXT: add x11, x13, x11 -; CHECK-NEXT: ldp s1, s5, [x9] -; CHECK-NEXT: ldp s0, s4, [x8] -; CHECK-NEXT: ld1 { v0.s }[1], [x10], #4 -; CHECK-NEXT: ld1 { v1.s }[1], [x0], #4 -; CHECK-NEXT: ldp s2, s6, [x11] -; CHECK-NEXT: ldp s3, s7, [x12] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w3 +; CHECK-NEXT: sxtw x9, w1 +; CHECK-NEXT: add x10, x2, x8 +; CHECK-NEXT: add x11, x0, x9 +; CHECK-NEXT: add x12, x10, x8 +; CHECK-NEXT: add x13, x11, x9 +; CHECK-NEXT: add x8, x12, x8 +; CHECK-NEXT: add x9, x13, x9 +; CHECK-NEXT: ldp s0, s6, [x11] +; CHECK-NEXT: ldp s3, s7, [x10] +; CHECK-NEXT: ldp s1, s5, [x8] +; CHECK-NEXT: ldp s2, s4, [x9] +; CHECK-NEXT: ld1 { v1.s }[1], [x12], #4 ; CHECK-NEXT: ld1 { v2.s }[1], [x13], #4 ; CHECK-NEXT: ld1 { v3.s }[1], [x2], #4 -; CHECK-NEXT: ld1 { v4.s }[1], [x10] -; CHECK-NEXT: ld1 { v5.s }[1], [x0] -; CHECK-NEXT: ld1 { v6.s }[1], [x13] +; CHECK-NEXT: ld1 { v0.s }[1], [x0], #4 +; CHECK-NEXT: ld1 { v5.s }[1], [x12] +; CHECK-NEXT: ld1 { v4.s }[1], [x13] ; CHECK-NEXT: ld1 { v7.s }[1], [x2] -; CHECK-NEXT: ushll v2.8h, v2.8b, 
#0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: usubl v16.4s, v0.4h, v2.4h -; CHECK-NEXT: usubl2 v0.4s, v0.8h, v2.8h -; CHECK-NEXT: usubl v2.4s, v1.4h, v3.4h -; CHECK-NEXT: usubl2 v1.4s, v1.8h, v3.8h -; CHECK-NEXT: ushll v3.8h, v4.8b, #0 -; CHECK-NEXT: ushll v4.8h, v5.8b, #0 -; CHECK-NEXT: ushll v5.8h, v6.8b, #0 -; CHECK-NEXT: ushll v6.8h, v7.8b, #0 -; CHECK-NEXT: usubl2 v7.4s, v3.8h, v5.8h -; CHECK-NEXT: usubl v3.4s, v3.4h, v5.4h -; CHECK-NEXT: usubl2 v5.4s, v4.8h, v6.8h -; CHECK-NEXT: usubl v4.4s, v4.4h, v6.4h -; CHECK-NEXT: shl v6.4s, v7.4s, #16 -; CHECK-NEXT: shl v5.4s, v5.4s, #16 -; CHECK-NEXT: shl v3.4s, v3.4s, #16 -; CHECK-NEXT: shl v4.4s, v4.4s, #16 -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-NEXT: add v2.4s, v4.4s, v2.4s -; CHECK-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-NEXT: rev64 v6.4s, v2.4s -; CHECK-NEXT: rev64 v17.4s, v1.4s -; CHECK-NEXT: add v3.4s, v3.4s, v16.4s -; CHECK-NEXT: rev64 v5.4s, v0.4s -; CHECK-NEXT: rev64 v4.4s, v3.4s -; CHECK-NEXT: addp v16.4s, v2.4s, v1.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v17.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s -; CHECK-NEXT: addp v7.4s, v3.4s, v0.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s -; CHECK-NEXT: zip1 v5.4s, v2.4s, v1.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s +; CHECK-NEXT: ld1 { v6.s }[1], [x0] +; CHECK-NEXT: usubl v0.8h, v0.8b, v3.8b +; CHECK-NEXT: usubl v1.8h, v2.8b, v1.8b +; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b +; CHECK-NEXT: shll v4.4s, v2.4h, #16 +; CHECK-NEXT: shll v5.4s, v3.4h, #16 +; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 +; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 +; CHECK-NEXT: saddw2 v3.4s, v3.4s, v0.8h +; CHECK-NEXT: saddw v0.4s, v5.4s, v0.4h +; CHECK-NEXT: saddw2 v2.4s, v2.4s, v1.8h +; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h +; CHECK-NEXT: rev64 v6.4s, v0.4s +; CHECK-NEXT: rev64 v17.4s, v3.4s +; CHECK-NEXT: rev64 v5.4s, v2.4s +; CHECK-NEXT: addp v7.4s, v1.4s, v2.4s +; CHECK-NEXT: rev64 v4.4s, v1.4s +; CHECK-NEXT: addp v16.4s, v0.4s, v3.4s +; CHECK-NEXT: 
sub v3.4s, v3.4s, v17.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s ; CHECK-NEXT: ext v18.16b, v7.16b, v7.16b, #8 -; CHECK-NEXT: zip2 v4.4s, v0.4s, v3.4s -; CHECK-NEXT: mov v0.s[1], v3.s[0] -; CHECK-NEXT: ext v3.16b, v2.16b, v5.16b, #8 -; CHECK-NEXT: mov v2.s[3], v1.s[2] +; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s +; CHECK-NEXT: zip1 v5.4s, v0.4s, v3.4s ; CHECK-NEXT: uzp2 v19.4s, v7.4s, v16.4s -; CHECK-NEXT: uzp1 v6.4s, v7.4s, v16.4s -; CHECK-NEXT: uzp1 v7.4s, v18.4s, v16.4s -; CHECK-NEXT: uzp2 v1.4s, v18.4s, v16.4s -; CHECK-NEXT: mov v0.d[1], v3.d[1] -; CHECK-NEXT: mov v4.d[1], v2.d[1] -; CHECK-NEXT: add v5.4s, v19.4s, v6.4s -; CHECK-NEXT: sub v1.4s, v7.4s, v1.4s -; CHECK-NEXT: rev64 v2.4s, v5.4s -; CHECK-NEXT: sub v6.4s, v0.4s, v4.4s -; CHECK-NEXT: add v0.4s, v4.4s, v0.4s -; CHECK-NEXT: rev64 v3.4s, v1.4s -; CHECK-NEXT: rev64 v4.4s, v6.4s -; CHECK-NEXT: rev64 v7.4s, v0.4s -; CHECK-NEXT: addp v16.4s, v1.4s, v6.4s -; CHECK-NEXT: addp v17.4s, v5.4s, v0.4s -; CHECK-NEXT: sub v4.4s, v6.4s, v4.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v2.4s, v5.4s, v2.4s -; CHECK-NEXT: ext v3.16b, v1.16b, v16.16b, #8 -; CHECK-NEXT: ext v5.16b, v17.16b, v0.16b, #4 -; CHECK-NEXT: ext v6.16b, v16.16b, v4.16b, #4 -; CHECK-NEXT: zip1 v7.4s, v17.4s, v17.4s -; CHECK-NEXT: ext v18.16b, v3.16b, v1.16b, #4 -; CHECK-NEXT: zip2 v5.4s, v5.4s, v17.4s -; CHECK-NEXT: zip2 v6.4s, v6.4s, v16.4s -; CHECK-NEXT: trn2 v7.4s, v7.4s, v2.4s -; CHECK-NEXT: ext v2.16b, v2.16b, v17.16b, #4 -; CHECK-NEXT: mov v1.s[2], v16.s[1] -; CHECK-NEXT: ext v5.16b, v0.16b, v5.16b, #12 -; CHECK-NEXT: ext v6.16b, v4.16b, v6.16b, #12 -; CHECK-NEXT: uzp2 v3.4s, v3.4s, v18.4s -; CHECK-NEXT: mov v4.s[2], v16.s[3] -; CHECK-NEXT: mov v0.s[2], v17.s[3] -; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #4 -; CHECK-NEXT: sub v18.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v19.4s, v4.4s, v6.4s -; CHECK-NEXT: sub v20.4s, v0.4s, v5.4s -; CHECK-NEXT: sub v21.4s, v7.4s, v2.4s -; CHECK-NEXT: mov v4.s[1], 
v16.s[2] -; CHECK-NEXT: mov v0.s[1], v17.s[2] -; CHECK-NEXT: mov v2.s[0], v17.s[1] -; CHECK-NEXT: mov v1.s[1], v16.s[0] -; CHECK-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-NEXT: add v2.4s, v7.4s, v2.4s -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mov v2.d[1], v21.d[1] -; CHECK-NEXT: mov v1.d[1], v18.d[1] -; CHECK-NEXT: mov v4.d[1], v19.d[1] -; CHECK-NEXT: mov v0.d[1], v20.d[1] -; CHECK-NEXT: movi v3.8h, #1 +; CHECK-NEXT: uzp1 v7.4s, v7.4s, v16.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s +; CHECK-NEXT: uzp1 v6.4s, v18.4s, v16.4s +; CHECK-NEXT: zip2 v4.4s, v2.4s, v1.4s +; CHECK-NEXT: uzp2 v16.4s, v18.4s, v16.4s +; CHECK-NEXT: mov v2.s[1], v1.s[0] +; CHECK-NEXT: ext v1.16b, v0.16b, v5.16b, #8 +; CHECK-NEXT: mov v0.s[3], v3.s[2] +; CHECK-NEXT: add v7.4s, v19.4s, v7.4s +; CHECK-NEXT: sub v3.4s, v6.4s, v16.4s +; CHECK-NEXT: rev64 v5.4s, v7.4s +; CHECK-NEXT: mov v2.d[1], v1.d[1] +; CHECK-NEXT: mov v4.d[1], v0.d[1] +; CHECK-NEXT: rev64 v6.4s, v3.4s +; CHECK-NEXT: sub v0.4s, v7.4s, v5.4s +; CHECK-NEXT: add v5.4s, v4.4s, v2.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v3.4s, v6.4s +; CHECK-NEXT: rev64 v4.4s, v5.4s +; CHECK-NEXT: addp v6.4s, v7.4s, v5.4s +; CHECK-NEXT: rev64 v7.4s, v2.4s +; CHECK-NEXT: addp v3.4s, v3.4s, v2.4s +; CHECK-NEXT: sub v4.4s, v5.4s, v4.4s +; CHECK-NEXT: zip1 v16.4s, v6.4s, v6.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s +; CHECK-NEXT: ext v17.16b, v1.16b, v3.16b, #8 +; CHECK-NEXT: ext v5.16b, v3.16b, v2.16b, #4 +; CHECK-NEXT: ext v7.16b, v6.16b, v4.16b, #4 +; CHECK-NEXT: ext v18.16b, v0.16b, v6.16b, #4 +; CHECK-NEXT: trn2 v0.4s, v16.4s, v0.4s +; CHECK-NEXT: ext v16.16b, v17.16b, v1.16b, #4 +; CHECK-NEXT: zip2 v5.4s, v5.4s, v3.4s +; CHECK-NEXT: zip2 v7.4s, v7.4s, v6.4s +; CHECK-NEXT: ext v18.16b, v18.16b, v18.16b, #4 +; CHECK-NEXT: mov v1.s[2], v3.s[1] +; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #12 +; CHECK-NEXT: ext v7.16b, v4.16b, v7.16b, #12 +; CHECK-NEXT: mov v2.s[2], v3.s[3] +; 
CHECK-NEXT: mov v4.s[2], v6.s[3] +; CHECK-NEXT: uzp2 v16.4s, v17.4s, v16.4s +; CHECK-NEXT: sub v19.4s, v0.4s, v18.4s +; CHECK-NEXT: mov v18.s[0], v6.s[1] +; CHECK-NEXT: sub v17.4s, v2.4s, v5.4s +; CHECK-NEXT: sub v20.4s, v4.4s, v7.4s +; CHECK-NEXT: sub v21.4s, v1.4s, v16.4s +; CHECK-NEXT: mov v2.s[1], v3.s[2] +; CHECK-NEXT: mov v4.s[1], v6.s[2] +; CHECK-NEXT: mov v1.s[1], v3.s[0] +; CHECK-NEXT: add v0.4s, v0.4s, v18.4s +; CHECK-NEXT: add v2.4s, v2.4s, v5.4s +; CHECK-NEXT: add v3.4s, v4.4s, v7.4s +; CHECK-NEXT: add v1.4s, v1.4s, v16.4s +; CHECK-NEXT: mov v0.d[1], v19.d[1] +; CHECK-NEXT: mov v1.d[1], v21.d[1] +; CHECK-NEXT: mov v2.d[1], v17.d[1] +; CHECK-NEXT: mov v3.d[1], v20.d[1] +; CHECK-NEXT: movi v4.8h, #1 ; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v5.4s, v4.4s, #15 -; CHECK-NEXT: ushr v6.4s, v2.4s, #15 -; CHECK-NEXT: ushr v7.4s, v0.4s, #15 +; CHECK-NEXT: ushr v5.4s, v2.4s, #15 +; CHECK-NEXT: ushr v6.4s, v0.4s, #15 +; CHECK-NEXT: ushr v7.4s, v3.4s, #15 ; CHECK-NEXT: ushr v16.4s, v1.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v3.16b -; CHECK-NEXT: and v16.16b, v16.16b, v3.16b -; CHECK-NEXT: and v7.16b, v7.16b, v3.16b -; CHECK-NEXT: and v3.16b, v5.16b, v3.16b +; CHECK-NEXT: and v6.16b, v6.16b, v4.16b +; CHECK-NEXT: and v16.16b, v16.16b, v4.16b +; CHECK-NEXT: and v7.16b, v7.16b, v4.16b +; CHECK-NEXT: and v4.16b, v5.16b, v4.16b ; CHECK-NEXT: mul v5.4s, v6.4s, v17.4s ; CHECK-NEXT: mul v6.4s, v16.4s, v17.4s -; CHECK-NEXT: mul v3.4s, v3.4s, v17.4s +; CHECK-NEXT: mul v4.4s, v4.4s, v17.4s ; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s -; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: add v1.4s, v6.4s, v1.4s -; CHECK-NEXT: add v4.4s, v3.4s, v4.4s -; CHECK-NEXT: add v0.4s, v7.4s, v0.4s -; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v7.16b +; CHECK-NEXT: add v2.4s, v4.4s, v2.4s +; CHECK-NEXT: add v3.4s, v7.4s, v3.4s +; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b +; CHECK-NEXT: eor v3.16b, 
v3.16b, v7.16b ; CHECK-NEXT: eor v1.16b, v1.16b, v6.16b -; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: add v0.4s, v0.4s, v3.4s -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v1.4s, v3.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -90,10 +90,9 @@ define <8 x i32> @extadds_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-LABEL: extadds_v8i8_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll v2.8h, v1.8b, #0 -; CHECK-NEXT: saddl2 v1.4s, v0.8h, v2.8h -; CHECK-NEXT: saddl v0.4s, v0.4h, v2.4h +; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i32> @@ -105,10 +104,9 @@ define <8 x i32> @extaddu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-LABEL: extaddu_v8i8_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v2.8h, v1.8b, #0 -; CHECK-NEXT: uaddl2 v1.4s, v0.8h, v2.8h -; CHECK-NEXT: uaddl v0.4s, v0.4h, v2.4h +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ret entry: %s0s = zext <8 x i8> %s0 to <8 x i32> @@ -120,14 +118,12 @@ define <16 x i32> @extadds_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: extadds_v16i8_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-NEXT: sshll2 v4.8h, v0.16b, #0 -; CHECK-NEXT: sshll2 v5.8h, v1.16b, #0 -; CHECK-NEXT: sshll v0.8h, v1.8b, #0 -; CHECK-NEXT: saddl2 v3.4s, v4.8h, v5.8h -; CHECK-NEXT: saddl2 
v1.4s, v2.8h, v0.8h -; CHECK-NEXT: saddl v0.4s, v2.4h, v0.4h -; CHECK-NEXT: saddl v2.4s, v4.4h, v5.4h +; CHECK-NEXT: saddl2 v2.8h, v0.16b, v1.16b +; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: sshll2 v3.4s, v2.8h, #0 +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 ; CHECK-NEXT: ret entry: %s0s = sext <16 x i8> %s0 to <16 x i32> @@ -139,14 +135,12 @@ define <16 x i32> @extaddu_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: extaddu_v16i8_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v4.8h, v0.16b, #0 -; CHECK-NEXT: ushll2 v5.8h, v1.16b, #0 -; CHECK-NEXT: ushll v0.8h, v1.8b, #0 -; CHECK-NEXT: uaddl2 v3.4s, v4.8h, v5.8h -; CHECK-NEXT: uaddl2 v1.4s, v2.8h, v0.8h -; CHECK-NEXT: uaddl v0.4s, v2.4h, v0.4h -; CHECK-NEXT: uaddl v2.4s, v4.4h, v5.4h +; CHECK-NEXT: uaddl2 v2.8h, v0.16b, v1.16b +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ushll2 v3.4s, v2.8h, #0 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-NEXT: ret entry: %s0s = zext <16 x i8> %s0 to <16 x i32> @@ -158,16 +152,13 @@ define <8 x i64> @extadds_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-LABEL: extadds_v8i8_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: sshll v2.4s, v0.4h, #0 -; CHECK-NEXT: sshll2 v4.4s, v0.8h, #0 -; CHECK-NEXT: sshll2 v5.4s, v1.8h, #0 -; CHECK-NEXT: sshll v0.4s, v1.4h, #0 -; CHECK-NEXT: saddl2 v3.2d, v4.4s, v5.4s -; CHECK-NEXT: saddl2 v1.2d, v2.4s, v0.4s -; CHECK-NEXT: saddl v0.2d, v2.2s, v0.2s -; CHECK-NEXT: saddl v2.2d, v4.2s, v5.2s +; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: sshll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll2 v3.2d, v2.4s, #0 +; CHECK-NEXT: sshll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NEXT: sshll v2.2d, v2.2s, 
#0 ; CHECK-NEXT: ret entry: %s0s = sext <8 x i8> %s0 to <8 x i64> @@ -179,16 +170,13 @@ define <8 x i64> @extaddu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-LABEL: extaddu_v8i8_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v5.4s, v1.8h, #0 -; CHECK-NEXT: ushll v0.4s, v1.4h, #0 -; CHECK-NEXT: uaddl2 v3.2d, v4.4s, v5.4s -; CHECK-NEXT: uaddl2 v1.2d, v2.4s, v0.4s -; CHECK-NEXT: uaddl v0.2d, v2.2s, v0.2s -; CHECK-NEXT: uaddl v2.2d, v4.2s, v5.2s +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 ; CHECK-NEXT: ret entry: %s0s = zext <8 x i8> %s0 to <8 x i64> @@ -286,10 +274,9 @@ define <4 x i64> @extadds_v4i16_i64(<4 x i16> %s0, <4 x i16> %s1) { ; CHECK-LABEL: extadds_v4i16_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-NEXT: sshll v2.4s, v1.4h, #0 -; CHECK-NEXT: saddl2 v1.2d, v0.4s, v2.4s -; CHECK-NEXT: saddl v0.2d, v0.2s, v2.2s +; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: sshll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret entry: %s0s = sext <4 x i16> %s0 to <4 x i64> @@ -301,10 +288,9 @@ define <4 x i64> @extaddu_v4i16_i64(<4 x i16> %s0, <4 x i16> %s1) { ; CHECK-LABEL: extaddu_v4i16_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-NEXT: uaddl2 v1.2d, v0.4s, v2.4s -; CHECK-NEXT: uaddl v0.2d, v0.2s, v2.2s +; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret entry: %s0s = zext <4 x i16> %s0 to <4 x i64> @@ -316,14 +302,12 @@ define <8 x i64> @extadds_v8i16_i64(<8 x i16> 
%s0, <8 x i16> %s1) { ; CHECK-LABEL: extadds_v8i16_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v2.4s, v0.4h, #0 -; CHECK-NEXT: sshll2 v4.4s, v0.8h, #0 -; CHECK-NEXT: sshll2 v5.4s, v1.8h, #0 -; CHECK-NEXT: sshll v0.4s, v1.4h, #0 -; CHECK-NEXT: saddl2 v3.2d, v4.4s, v5.4s -; CHECK-NEXT: saddl2 v1.2d, v2.4s, v0.4s -; CHECK-NEXT: saddl v0.2d, v2.2s, v0.2s -; CHECK-NEXT: saddl v2.2d, v4.2s, v5.2s +; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: sshll2 v3.2d, v2.4s, #0 +; CHECK-NEXT: sshll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NEXT: sshll v2.2d, v2.2s, #0 ; CHECK-NEXT: ret entry: %s0s = sext <8 x i16> %s0 to <8 x i64> @@ -335,14 +319,12 @@ define <8 x i64> @extaddu_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: extaddu_v8i16_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v5.4s, v1.8h, #0 -; CHECK-NEXT: ushll v0.4s, v1.4h, #0 -; CHECK-NEXT: uaddl2 v3.2d, v4.4s, v5.4s -; CHECK-NEXT: uaddl2 v1.2d, v2.4s, v0.4s -; CHECK-NEXT: uaddl v0.2d, v2.2s, v0.2s -; CHECK-NEXT: uaddl v2.2d, v4.2s, v5.2s +; CHECK-NEXT: uaddl2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 ; CHECK-NEXT: ret entry: %s0s = zext <8 x i16> %s0 to <8 x i64> @@ -440,18 +422,14 @@ define <16 x i32> @add_zs(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: add_zs: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: sshll2 v6.8h, v1.16b, #0 -; CHECK-NEXT: sshll v7.8h, v1.8b, #0 -; CHECK-NEXT: saddw2 v3.4s, v0.4s, v6.8h -; CHECK-NEXT: saddw2 v1.4s, v2.4s, 
v7.8h -; CHECK-NEXT: saddw v0.4s, v4.4s, v7.4h -; CHECK-NEXT: saddw v2.4s, v5.4s, v6.4h +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: saddw2 v2.8h, v2.8h, v1.16b +; CHECK-NEXT: saddw v0.8h, v0.8h, v1.8b +; CHECK-NEXT: sshll2 v3.4s, v2.8h, #0 +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 ; CHECK-NEXT: ret entry: %s0s = zext <16 x i8> %s0 to <16 x i32> @@ -463,87 +441,86 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { ; CHECK-LABEL: v20: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b0, [sp] -; CHECK-NEXT: add x9, sp, #8 -; CHECK-NEXT: ldr b1, [sp, #64] -; CHECK-NEXT: add x10, sp, #72 -; CHECK-NEXT: add x11, sp, #16 +; CHECK-NEXT: ldr b0, [sp, #96] +; CHECK-NEXT: add x9, sp, #104 ; CHECK-NEXT: ldr b2, [sp, #160] +; CHECK-NEXT: add x10, sp, #168 +; CHECK-NEXT: ldr b3, [sp] +; CHECK-NEXT: fmov s1, w0 ; CHECK-NEXT: ld1 { v0.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #80 -; CHECK-NEXT: ld1 { v1.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #24 -; CHECK-NEXT: fmov s3, w0 -; CHECK-NEXT: ldr b4, [sp, #96] -; CHECK-NEXT: ldr b5, [sp, #224] -; CHECK-NEXT: add x12, sp, #56 -; CHECK-NEXT: ld1 { v0.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #168 -; CHECK-NEXT: ld1 { v1.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #88 -; CHECK-NEXT: mov v3.b[1], w1 -; CHECK-NEXT: ld1 { v2.b }[1], [x11] -; CHECK-NEXT: add x11, sp, #176 -; CHECK-NEXT: ld1 { v0.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #32 -; CHECK-NEXT: ld1 { v1.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #104 -; CHECK-NEXT: mov v3.b[2], w2 -; CHECK-NEXT: ld1 { v2.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #48 -; CHECK-NEXT: ld1 { v0.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #40 -; CHECK-NEXT: ld1 { v4.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #112 -; CHECK-NEXT: mov v3.b[3], w3 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ld1 { v0.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #184 -; CHECK-NEXT: ld1 { v4.b }[2], [x9] +; 
CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #8 +; CHECK-NEXT: add x11, sp, #128 +; CHECK-NEXT: add x12, sp, #184 +; CHECK-NEXT: mov v1.b[1], w1 +; CHECK-NEXT: add x13, sp, #192 +; CHECK-NEXT: ld1 { v0.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #120 -; CHECK-NEXT: mov v3.b[4], w4 -; CHECK-NEXT: ld1 { v2.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #192 -; CHECK-NEXT: ld1 { v0.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #200 -; CHECK-NEXT: ld1 { v4.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: mov v3.b[5], w5 -; CHECK-NEXT: ld1 { v2.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #232 -; CHECK-NEXT: ld1 { v0.b }[7], [x12] -; CHECK-NEXT: ld1 { v4.b }[4], [x9] +; CHECK-NEXT: ld1 { v3.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #16 +; CHECK-NEXT: ldr b4, [sp, #224] +; CHECK-NEXT: mov v1.b[2], w2 +; CHECK-NEXT: ldr b5, [sp, #64] +; CHECK-NEXT: ld1 { v0.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #176 +; CHECK-NEXT: ld1 { v3.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #24 +; CHECK-NEXT: ld1 { v2.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #136 +; CHECK-NEXT: ld1 { v0.b }[4], [x11] +; CHECK-NEXT: add x11, sp, #144 +; CHECK-NEXT: ld1 { v3.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #32 +; CHECK-NEXT: mov v1.b[3], w3 +; CHECK-NEXT: ld1 { v2.b }[3], [x12] +; CHECK-NEXT: add x12, sp, #200 +; CHECK-NEXT: ld1 { v0.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #152 +; CHECK-NEXT: ld1 { v3.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #72 +; CHECK-NEXT: mov v1.b[4], w4 +; CHECK-NEXT: ld1 { v2.b }[4], [x13] +; CHECK-NEXT: add x13, sp, #232 +; CHECK-NEXT: ld1 { v0.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #40 ; CHECK-NEXT: ld1 { v5.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #240 -; CHECK-NEXT: ld1 { v2.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #208 -; CHECK-NEXT: mov v3.b[6], w6 -; CHECK-NEXT: ld1 { v4.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #144 -; CHECK-NEXT: ld1 { v5.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #248 -; CHECK-NEXT: ld1 { v2.b }[6], [x11] +; CHECK-NEXT: add x10, sp, #80 +; 
CHECK-NEXT: ld1 { v4.b }[1], [x13] +; CHECK-NEXT: ld1 { v2.b }[5], [x12] +; CHECK-NEXT: add x12, sp, #240 +; CHECK-NEXT: ld1 { v0.b }[7], [x9] +; CHECK-NEXT: add x9, sp, #208 +; CHECK-NEXT: ld1 { v3.b }[5], [x11] ; CHECK-NEXT: add x11, sp, #216 -; CHECK-NEXT: mov v3.b[7], w7 -; CHECK-NEXT: ld1 { v4.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ld1 { v5.b }[3], [x10] +; CHECK-NEXT: mov v1.b[5], w5 +; CHECK-NEXT: ld1 { v4.b }[2], [x12] +; CHECK-NEXT: ld1 { v2.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #48 +; CHECK-NEXT: ld1 { v5.b }[2], [x10] +; CHECK-NEXT: add x12, sp, #248 +; CHECK-NEXT: add x10, sp, #56 +; CHECK-NEXT: ld1 { v3.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #88 +; CHECK-NEXT: mov v1.b[6], w6 +; CHECK-NEXT: ld1 { v4.b }[3], [x12] ; CHECK-NEXT: ld1 { v2.b }[7], [x11] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ld1 { v4.b }[7], [x9] -; CHECK-NEXT: ushll v5.8h, v5.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: uaddl v1.4s, v1.4h, v5.4h -; CHECK-NEXT: uaddl2 v5.4s, v0.8h, v2.8h -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: ld1 { v5.b }[3], [x9] +; CHECK-NEXT: ld1 { v3.b }[7], [x10] +; CHECK-NEXT: mov v1.b[7], w7 ; CHECK-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-NEXT: uaddl v0.4s, v0.4h, v2.4h -; CHECK-NEXT: stp q5, q1, [x8, #48] -; CHECK-NEXT: uaddl2 v2.4s, v3.8h, v4.8h -; CHECK-NEXT: uaddl v1.4s, v3.4h, v4.4h -; CHECK-NEXT: stp q2, q0, [x8, #16] -; CHECK-NEXT: str q1, [x8] +; CHECK-NEXT: ushll v5.8h, v5.8b, #0 +; CHECK-NEXT: uaddl v2.8h, v3.8b, v2.8b +; CHECK-NEXT: add v3.4h, v5.4h, v4.4h +; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b +; CHECK-NEXT: ushll v1.4s, v3.4h, #0 +; CHECK-NEXT: ushll2 v3.4s, v2.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: stp q3, q1, [x8, #48] +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: stp q1, q2, [x8, #16] +; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: ret entry: %s0s = zext <20 x i8> %s0 to <20 x i32> @@ -643,14 +620,12 @@ define <16 x i32> 
@sub_zz(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: sub_zz: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v4.8h, v0.16b, #0 -; CHECK-NEXT: ushll2 v5.8h, v1.16b, #0 -; CHECK-NEXT: ushll v0.8h, v1.8b, #0 -; CHECK-NEXT: usubl2 v3.4s, v4.8h, v5.8h -; CHECK-NEXT: usubl2 v1.4s, v2.8h, v0.8h -; CHECK-NEXT: usubl v0.4s, v2.4h, v0.4h -; CHECK-NEXT: usubl v2.4s, v4.4h, v5.4h +; CHECK-NEXT: usubl2 v2.8h, v0.16b, v1.16b +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: sshll2 v3.4s, v2.8h, #0 +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 ; CHECK-NEXT: ret entry: %s0s = zext <16 x i8> %s0 to <16 x i32> @@ -662,14 +637,12 @@ define <16 x i32> @sub_ss(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: sub_ss: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-NEXT: sshll2 v4.8h, v0.16b, #0 -; CHECK-NEXT: sshll2 v5.8h, v1.16b, #0 -; CHECK-NEXT: sshll v0.8h, v1.8b, #0 -; CHECK-NEXT: ssubl2 v3.4s, v4.8h, v5.8h -; CHECK-NEXT: ssubl2 v1.4s, v2.8h, v0.8h -; CHECK-NEXT: ssubl v0.4s, v2.4h, v0.4h -; CHECK-NEXT: ssubl v2.4s, v4.4h, v5.4h +; CHECK-NEXT: ssubl2 v2.8h, v0.16b, v1.16b +; CHECK-NEXT: ssubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: sshll2 v3.4s, v2.8h, #0 +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 ; CHECK-NEXT: ret entry: %s0s = sext <16 x i8> %s0 to <16 x i32> @@ -681,18 +654,14 @@ define <16 x i32> @sub_zs(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: sub_zs: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: sshll2 v6.8h, v1.16b, #0 -; CHECK-NEXT: sshll v7.8h, v1.8b, #0 -; CHECK-NEXT: ssubw2 v3.4s, v0.4s, v6.8h -; CHECK-NEXT: ssubw2 v1.4s, v2.4s, 
v7.8h -; CHECK-NEXT: ssubw v0.4s, v4.4s, v7.4h -; CHECK-NEXT: ssubw v2.4s, v5.4s, v6.4h +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ssubw2 v2.8h, v2.8h, v1.16b +; CHECK-NEXT: ssubw v0.8h, v0.8h, v1.8b +; CHECK-NEXT: sshll2 v3.4s, v2.8h, #0 +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 ; CHECK-NEXT: ret entry: %s0s = zext <16 x i8> %s0 to <16 x i32> diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -4,151 +4,138 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) { ; CHECK-LABEL: v1: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 +; CHECK-NEXT: sxtw x10, w3 ; CHECK-NEXT: add x9, x0, x8 -; CHECK-NEXT: add x10, x9, x8 -; CHECK-NEXT: add x8, x10, x8 -; CHECK-NEXT: ldp s3, s0, [x9] -; CHECK-NEXT: ldp s1, s2, [x8] -; CHECK-NEXT: sxtw x8, w3 -; CHECK-NEXT: add x11, x2, x8 -; CHECK-NEXT: add x12, x11, x8 -; CHECK-NEXT: ld1 { v1.s }[1], [x10], #4 -; CHECK-NEXT: add x8, x12, x8 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ld1 { v2.s }[1], [x10] -; CHECK-NEXT: ld1 { v3.s }[1], [x0], #4 -; CHECK-NEXT: ldp s5, s4, [x8] -; CHECK-NEXT: ldp s7, s6, [x11] -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ld1 { v0.s }[1], [x0] -; CHECK-NEXT: ld1 { v5.s }[1], [x12], #4 -; CHECK-NEXT: ushll v5.8h, v5.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: usubl v16.4s, v1.4h, v5.4h -; CHECK-NEXT: ld1 { v4.s }[1], [x12] +; CHECK-NEXT: add x12, x2, x10 +; CHECK-NEXT: add x11, x9, x8 +; CHECK-NEXT: add x13, x12, x10 +; CHECK-NEXT: add x8, x11, x8 +; CHECK-NEXT: add x10, x13, 
x10 +; CHECK-NEXT: ldp s1, s0, [x9] +; CHECK-NEXT: ldp s7, s6, [x12] +; CHECK-NEXT: ldp s3, s2, [x8] +; CHECK-NEXT: ldp s5, s4, [x10] +; CHECK-NEXT: ld1 { v5.s }[1], [x13], #4 +; CHECK-NEXT: ld1 { v3.s }[1], [x11], #4 ; CHECK-NEXT: ld1 { v7.s }[1], [x2], #4 -; CHECK-NEXT: usubl2 v1.4s, v1.8h, v5.8h -; CHECK-NEXT: ushll v5.8h, v7.8b, #0 -; CHECK-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-NEXT: ld1 { v1.s }[1], [x0], #4 +; CHECK-NEXT: ld1 { v4.s }[1], [x13] +; CHECK-NEXT: ld1 { v2.s }[1], [x11] ; CHECK-NEXT: ld1 { v6.s }[1], [x2] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: usubl v17.4s, v3.4h, v5.4h -; CHECK-NEXT: usubl2 v5.4s, v3.8h, v5.8h -; CHECK-NEXT: usubl2 v3.4s, v2.8h, v4.8h -; CHECK-NEXT: usubl v2.4s, v2.4h, v4.4h -; CHECK-NEXT: ushll v4.8h, v6.8b, #0 -; CHECK-NEXT: shl v3.4s, v3.4s, #16 -; CHECK-NEXT: usubl2 v6.4s, v0.8h, v4.8h -; CHECK-NEXT: shl v2.4s, v2.4s, #16 -; CHECK-NEXT: usubl v0.4s, v0.4h, v4.4h -; CHECK-NEXT: add v19.4s, v3.4s, v1.4s -; CHECK-NEXT: shl v6.4s, v6.4s, #16 -; CHECK-NEXT: shl v4.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v2.4s, v16.4s -; CHECK-NEXT: add v7.4s, v6.4s, v5.4s -; CHECK-NEXT: add v18.4s, v4.4s, v17.4s -; CHECK-NEXT: ext v20.16b, v1.16b, v1.16b, #12 -; CHECK-NEXT: zip1 v17.4s, v18.4s, v7.4s -; CHECK-NEXT: uzp2 v16.4s, v1.4s, v19.4s -; CHECK-NEXT: mov v2.16b, v1.16b -; CHECK-NEXT: mov v6.16b, v19.16b -; CHECK-NEXT: mov v2.s[0], v19.s[1] -; CHECK-NEXT: ext v5.16b, v19.16b, v20.16b, #12 -; CHECK-NEXT: ext v20.16b, v18.16b, v17.16b, #8 -; CHECK-NEXT: mov v6.s[1], v1.s[0] -; CHECK-NEXT: zip2 v3.4s, v1.4s, v19.4s -; CHECK-NEXT: uzp2 v4.4s, v16.4s, v1.4s -; CHECK-NEXT: zip2 v16.4s, v18.4s, v7.4s -; CHECK-NEXT: mov v18.s[3], v7.s[2] -; CHECK-NEXT: mov v2.d[1], v17.d[1] -; CHECK-NEXT: mov v6.d[1], v20.d[1] -; CHECK-NEXT: zip2 v0.4s, v19.4s, v1.4s -; CHECK-NEXT: mov v4.d[1], v16.d[1] -; CHECK-NEXT: mov v3.d[1], v18.d[1] -; CHECK-NEXT: add v1.4s, v2.4s, v6.4s -; CHECK-NEXT: mov v5.d[1], v16.d[1] -; CHECK-NEXT: mov v0.d[1], v18.d[1] 
-; CHECK-NEXT: add v3.4s, v4.4s, v3.4s -; CHECK-NEXT: sub v2.4s, v6.4s, v2.4s +; CHECK-NEXT: ld1 { v0.s }[1], [x0] +; CHECK-NEXT: usubl v3.8h, v3.8b, v5.8b +; CHECK-NEXT: usubl v2.8h, v2.8b, v4.8b +; CHECK-NEXT: usubl v1.8h, v1.8b, v7.8b +; CHECK-NEXT: usubl v0.8h, v0.8b, v6.8b +; CHECK-NEXT: shll v4.4s, v2.4h, #16 +; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 +; CHECK-NEXT: shll v5.4s, v0.4h, #16 +; CHECK-NEXT: shll2 v0.4s, v0.8h, #16 +; CHECK-NEXT: saddw2 v2.4s, v2.4s, v3.8h +; CHECK-NEXT: saddw v3.4s, v4.4s, v3.4h +; CHECK-NEXT: saddw2 v0.4s, v0.4s, v1.8h +; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h +; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s +; CHECK-NEXT: ext v16.16b, v3.16b, v3.16b, #12 +; CHECK-NEXT: zip1 v17.4s, v1.4s, v0.4s +; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s +; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s +; CHECK-NEXT: uzp2 v5.4s, v5.4s, v3.4s +; CHECK-NEXT: ext v19.16b, v1.16b, v17.16b, #8 +; CHECK-NEXT: mov v1.s[3], v0.s[2] +; CHECK-NEXT: zip2 v4.4s, v2.4s, v3.4s +; CHECK-NEXT: mov v7.16b, v3.16b +; CHECK-NEXT: ext v16.16b, v2.16b, v16.16b, #12 +; CHECK-NEXT: mov v7.s[0], v2.s[1] +; CHECK-NEXT: mov v2.s[1], v3.s[0] +; CHECK-NEXT: mov v5.d[1], v6.d[1] +; CHECK-NEXT: mov v18.d[1], v1.d[1] +; CHECK-NEXT: mov v16.d[1], v6.d[1] +; CHECK-NEXT: mov v4.d[1], v1.d[1] +; CHECK-NEXT: mov v7.d[1], v17.d[1] +; CHECK-NEXT: mov v2.d[1], v19.d[1] +; CHECK-NEXT: add v1.4s, v5.4s, v18.4s +; CHECK-NEXT: sub v3.4s, v4.4s, v16.4s +; CHECK-NEXT: rev64 v4.4s, v1.4s +; CHECK-NEXT: add v0.4s, v7.4s, v2.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s +; CHECK-NEXT: rev64 v5.4s, v0.4s +; CHECK-NEXT: mov v4.d[1], v1.d[1] +; CHECK-NEXT: add v6.4s, v3.4s, v2.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v3.4s +; CHECK-NEXT: mov v5.d[1], v0.d[1] +; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s +; CHECK-NEXT: rev64 v7.4s, v2.4s +; CHECK-NEXT: rev64 v3.4s, v6.4s +; CHECK-NEXT: rev64 v4.4s, v0.4s +; CHECK-NEXT: add v1.4s, v1.4s, v5.4s +; CHECK-NEXT: sub v7.4s, v2.4s, v7.4s +; CHECK-NEXT: addp v5.4s, v1.4s, v6.4s +; CHECK-NEXT: 
addp v2.4s, v0.4s, v2.4s +; CHECK-NEXT: sub v3.4s, v6.4s, v3.4s ; CHECK-NEXT: rev64 v6.4s, v1.4s -; CHECK-NEXT: rev64 v4.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s -; CHECK-NEXT: movi v19.8h, #1 -; CHECK-NEXT: mov v6.d[1], v1.d[1] -; CHECK-NEXT: add v5.4s, v0.4s, v2.4s -; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s -; CHECK-NEXT: mov v4.d[1], v3.d[1] -; CHECK-NEXT: rev64 v2.4s, v5.4s -; CHECK-NEXT: rev64 v7.4s, v0.4s -; CHECK-NEXT: add v3.4s, v3.4s, v6.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s -; CHECK-NEXT: addp v4.4s, v3.4s, v5.4s -; CHECK-NEXT: sub v2.4s, v5.4s, v2.4s -; CHECK-NEXT: sub v6.4s, v0.4s, v7.4s -; CHECK-NEXT: rev64 v7.4s, v3.4s -; CHECK-NEXT: rev64 v5.4s, v1.4s -; CHECK-NEXT: zip1 v16.4s, v4.4s, v4.4s -; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s -; CHECK-NEXT: ext v17.16b, v4.16b, v2.16b, #4 -; CHECK-NEXT: sub v3.4s, v3.4s, v7.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s -; CHECK-NEXT: ext v7.16b, v0.16b, v6.16b, #4 -; CHECK-NEXT: ext v18.16b, v3.16b, v4.16b, #4 -; CHECK-NEXT: trn2 v3.4s, v16.4s, v3.4s -; CHECK-NEXT: ext v16.16b, v1.16b, v0.16b, #8 -; CHECK-NEXT: zip2 v5.4s, v17.4s, v4.4s -; CHECK-NEXT: zip2 v7.4s, v7.4s, v0.4s -; CHECK-NEXT: ext v17.16b, v18.16b, v18.16b, #4 -; CHECK-NEXT: ext v18.16b, v16.16b, v1.16b, #4 -; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #12 -; CHECK-NEXT: mov v2.s[2], v4.s[3] -; CHECK-NEXT: ext v7.16b, v6.16b, v7.16b, #12 -; CHECK-NEXT: mov v1.s[2], v0.s[1] -; CHECK-NEXT: mov v6.s[2], v0.s[3] -; CHECK-NEXT: uzp2 v16.4s, v16.4s, v18.4s -; CHECK-NEXT: sub v18.4s, v2.4s, v5.4s -; CHECK-NEXT: mov v2.s[1], v4.s[2] -; CHECK-NEXT: sub v20.4s, v3.4s, v17.4s -; CHECK-NEXT: mov v17.s[0], v4.s[1] -; CHECK-NEXT: sub v21.4s, v6.4s, v7.4s -; CHECK-NEXT: sub v4.4s, v1.4s, v16.4s -; CHECK-NEXT: mov v6.s[1], v0.s[2] -; CHECK-NEXT: mov v1.s[1], v0.s[0] -; CHECK-NEXT: add v0.4s, v2.4s, v5.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s +; CHECK-NEXT: zip1 v16.4s, v5.4s, v5.4s +; CHECK-NEXT: ext v17.16b, v2.16b, v7.16b, #4 +; CHECK-NEXT: ext v18.16b, 
v5.16b, v3.16b, #4 +; CHECK-NEXT: ext v4.16b, v0.16b, v2.16b, #8 +; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s +; CHECK-NEXT: ext v6.16b, v1.16b, v5.16b, #4 +; CHECK-NEXT: trn2 v1.4s, v16.4s, v1.4s +; CHECK-NEXT: zip2 v16.4s, v17.4s, v2.4s +; CHECK-NEXT: zip2 v17.4s, v18.4s, v5.4s +; CHECK-NEXT: ext v18.16b, v4.16b, v0.16b, #4 +; CHECK-NEXT: ext v6.16b, v6.16b, v6.16b, #4 +; CHECK-NEXT: ext v16.16b, v7.16b, v16.16b, #12 +; CHECK-NEXT: ext v17.16b, v3.16b, v17.16b, #12 +; CHECK-NEXT: mov v0.s[2], v2.s[1] +; CHECK-NEXT: uzp2 v4.4s, v4.4s, v18.4s +; CHECK-NEXT: mov v7.s[2], v2.s[3] +; CHECK-NEXT: mov v3.s[2], v5.s[3] +; CHECK-NEXT: sub v18.4s, v1.4s, v6.4s +; CHECK-NEXT: mov v6.s[0], v5.s[1] +; CHECK-NEXT: sub v19.4s, v0.4s, v4.4s +; CHECK-NEXT: sub v20.4s, v7.4s, v16.4s +; CHECK-NEXT: sub v21.4s, v3.4s, v17.4s +; CHECK-NEXT: mov v0.s[1], v2.s[0] +; CHECK-NEXT: mov v7.s[1], v2.s[2] +; CHECK-NEXT: mov v3.s[1], v5.s[2] +; CHECK-NEXT: add v1.4s, v1.4s, v6.4s +; CHECK-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NEXT: add v2.4s, v7.4s, v16.4s ; CHECK-NEXT: add v3.4s, v3.4s, v17.4s -; CHECK-NEXT: mov v0.d[1], v18.d[1] -; CHECK-NEXT: add v2.4s, v6.4s, v7.4s -; CHECK-NEXT: add v1.4s, v1.4s, v16.4s -; CHECK-NEXT: mov v3.d[1], v20.d[1] -; CHECK-NEXT: mov v2.d[1], v21.d[1] -; CHECK-NEXT: mov v1.d[1], v4.d[1] -; CHECK-NEXT: ushr v5.4s, v0.4s, #15 -; CHECK-NEXT: ushr v7.4s, v3.4s, #15 -; CHECK-NEXT: and v4.16b, v5.16b, v19.16b +; CHECK-NEXT: mov v2.d[1], v20.d[1] +; CHECK-NEXT: mov v3.d[1], v21.d[1] +; CHECK-NEXT: mov v1.d[1], v18.d[1] +; CHECK-NEXT: mov v0.d[1], v19.d[1] +; CHECK-NEXT: movi v4.8h, #1 +; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff +; CHECK-NEXT: ushr v5.4s, v1.4s, #15 ; CHECK-NEXT: ushr v6.4s, v2.4s, #15 -; CHECK-NEXT: movi v5.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v16.4s, v1.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v19.16b -; CHECK-NEXT: and v16.16b, v16.16b, v19.16b -; CHECK-NEXT: and v7.16b, v7.16b, v19.16b -; CHECK-NEXT: mul v6.4s, v6.4s, v5.4s -; CHECK-NEXT: mul 
v4.4s, v4.4s, v5.4s -; CHECK-NEXT: mul v7.4s, v7.4s, v5.4s -; CHECK-NEXT: mul v5.4s, v16.4s, v5.4s -; CHECK-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-NEXT: add v0.4s, v4.4s, v0.4s -; CHECK-NEXT: add v3.4s, v7.4s, v3.4s -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b -; CHECK-NEXT: eor v1.16b, v1.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v4.16b -; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: ushr v7.4s, v0.4s, #15 +; CHECK-NEXT: ushr v16.4s, v3.4s, #15 +; CHECK-NEXT: and v6.16b, v6.16b, v4.16b +; CHECK-NEXT: and v16.16b, v16.16b, v4.16b +; CHECK-NEXT: and v7.16b, v7.16b, v4.16b +; CHECK-NEXT: and v4.16b, v5.16b, v4.16b +; CHECK-NEXT: mul v5.4s, v6.4s, v17.4s +; CHECK-NEXT: mul v6.4s, v16.4s, v17.4s +; CHECK-NEXT: mul v4.4s, v4.4s, v17.4s +; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s +; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: add v3.4s, v6.4s, v3.4s +; CHECK-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-NEXT: add v0.4s, v7.4s, v0.4s +; CHECK-NEXT: eor v1.16b, v1.16b, v4.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v7.16b +; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b +; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b +; CHECK-NEXT: add v2.4s, v3.4s, v2.4s ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 @@ -249,144 +236,130 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) { ; CHECK-LABEL: v2: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 +; CHECK-NEXT: sxtw x10, w3 ; CHECK-NEXT: add x9, x0, x8 -; CHECK-NEXT: add x10, x9, x8 -; CHECK-NEXT: add x8, x10, x8 -; CHECK-NEXT: ldp s3, s0, [x9] -; CHECK-NEXT: ldp s1, s2, [x8] -; 
CHECK-NEXT: sxtw x8, w3 -; CHECK-NEXT: add x11, x2, x8 -; CHECK-NEXT: add x12, x11, x8 -; CHECK-NEXT: ld1 { v1.s }[1], [x10], #4 -; CHECK-NEXT: add x8, x12, x8 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ld1 { v2.s }[1], [x10] -; CHECK-NEXT: ld1 { v3.s }[1], [x0], #4 -; CHECK-NEXT: ldp s5, s4, [x8] -; CHECK-NEXT: ldp s7, s6, [x11] -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ld1 { v0.s }[1], [x0] -; CHECK-NEXT: ld1 { v5.s }[1], [x12], #4 -; CHECK-NEXT: ushll v5.8h, v5.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: usubl v16.4s, v1.4h, v5.4h -; CHECK-NEXT: ld1 { v4.s }[1], [x12] +; CHECK-NEXT: add x12, x2, x10 +; CHECK-NEXT: add x11, x9, x8 +; CHECK-NEXT: add x13, x12, x10 +; CHECK-NEXT: add x8, x11, x8 +; CHECK-NEXT: add x10, x13, x10 +; CHECK-NEXT: ldp s1, s0, [x9] +; CHECK-NEXT: ldp s7, s6, [x12] +; CHECK-NEXT: ldp s3, s2, [x8] +; CHECK-NEXT: ldp s5, s4, [x10] +; CHECK-NEXT: ld1 { v5.s }[1], [x13], #4 +; CHECK-NEXT: ld1 { v3.s }[1], [x11], #4 ; CHECK-NEXT: ld1 { v7.s }[1], [x2], #4 -; CHECK-NEXT: usubl2 v1.4s, v1.8h, v5.8h -; CHECK-NEXT: ushll v5.8h, v7.8b, #0 -; CHECK-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-NEXT: ld1 { v1.s }[1], [x0], #4 +; CHECK-NEXT: ld1 { v4.s }[1], [x13] +; CHECK-NEXT: ld1 { v2.s }[1], [x11] ; CHECK-NEXT: ld1 { v6.s }[1], [x2] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: usubl v7.4s, v3.4h, v5.4h -; CHECK-NEXT: usubl2 v3.4s, v3.8h, v5.8h -; CHECK-NEXT: usubl2 v5.4s, v2.8h, v4.8h -; CHECK-NEXT: usubl v2.4s, v2.4h, v4.4h -; CHECK-NEXT: ushll v4.8h, v6.8b, #0 -; CHECK-NEXT: shl v5.4s, v5.4s, #16 -; CHECK-NEXT: usubl2 v6.4s, v0.8h, v4.8h -; CHECK-NEXT: shl v2.4s, v2.4s, #16 -; CHECK-NEXT: usubl v0.4s, v0.4h, v4.4h -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-NEXT: shl v4.4s, v6.4s, #16 -; CHECK-NEXT: shl v0.4s, v0.4s, #16 -; CHECK-NEXT: add v2.4s, v2.4s, v16.4s -; CHECK-NEXT: add v3.4s, v4.4s, v3.4s -; CHECK-NEXT: add v0.4s, v0.4s, v7.4s -; CHECK-NEXT: uzp2 v6.4s, v2.4s, v1.4s -; CHECK-NEXT: ext v17.16b, 
v2.16b, v2.16b, #12 -; CHECK-NEXT: zip1 v4.4s, v0.4s, v3.4s -; CHECK-NEXT: mov v16.16b, v2.16b -; CHECK-NEXT: mov v19.16b, v0.16b -; CHECK-NEXT: zip2 v5.4s, v1.4s, v2.4s -; CHECK-NEXT: zip2 v18.4s, v2.4s, v1.4s -; CHECK-NEXT: mov v16.s[0], v1.s[1] -; CHECK-NEXT: uzp2 v6.4s, v6.4s, v2.4s -; CHECK-NEXT: ext v7.16b, v1.16b, v17.16b, #12 -; CHECK-NEXT: zip2 v17.4s, v0.4s, v3.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v4.16b, #8 -; CHECK-NEXT: mov v1.s[1], v2.s[0] -; CHECK-NEXT: mov v19.s[3], v3.s[2] -; CHECK-NEXT: mov v6.d[1], v17.d[1] -; CHECK-NEXT: mov v16.d[1], v4.d[1] -; CHECK-NEXT: mov v1.d[1], v0.d[1] -; CHECK-NEXT: mov v18.d[1], v19.d[1] -; CHECK-NEXT: mov v7.d[1], v17.d[1] -; CHECK-NEXT: mov v5.d[1], v19.d[1] -; CHECK-NEXT: add v0.4s, v16.4s, v1.4s -; CHECK-NEXT: add v3.4s, v6.4s, v18.4s -; CHECK-NEXT: rev64 v2.4s, v0.4s -; CHECK-NEXT: sub v4.4s, v5.4s, v7.4s -; CHECK-NEXT: rev64 v5.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v16.4s -; CHECK-NEXT: mov v2.d[1], v0.d[1] -; CHECK-NEXT: add v6.4s, v4.4s, v1.4s -; CHECK-NEXT: mov v5.d[1], v3.d[1] -; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s -; CHECK-NEXT: add v2.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s -; CHECK-NEXT: zip1 v4.4s, v2.4s, v6.4s -; CHECK-NEXT: uzp2 v5.4s, v2.4s, v6.4s -; CHECK-NEXT: zip1 v7.4s, v0.4s, v1.4s -; CHECK-NEXT: mov v16.16b, v2.16b -; CHECK-NEXT: zip2 v3.4s, v2.4s, v6.4s -; CHECK-NEXT: mov v16.s[1], v6.s[1] -; CHECK-NEXT: mov v6.16b, v0.16b -; CHECK-NEXT: trn2 v4.4s, v2.4s, v4.4s -; CHECK-NEXT: zip2 v17.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v7.16b, #8 -; CHECK-NEXT: uzp2 v2.4s, v5.4s, v2.4s -; CHECK-NEXT: mov v6.s[3], v1.s[2] +; CHECK-NEXT: ld1 { v0.s }[1], [x0] +; CHECK-NEXT: usubl v3.8h, v3.8b, v5.8b +; CHECK-NEXT: usubl v2.8h, v2.8b, v4.8b +; CHECK-NEXT: usubl v1.8h, v1.8b, v7.8b +; CHECK-NEXT: usubl v0.8h, v0.8b, v6.8b +; CHECK-NEXT: shll v4.4s, v2.4h, #16 +; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 +; CHECK-NEXT: shll v5.4s, v0.4h, #16 +; CHECK-NEXT: shll2 v0.4s, v0.8h, #16 +; 
CHECK-NEXT: saddw2 v2.4s, v2.4s, v3.8h +; CHECK-NEXT: saddw v3.4s, v4.4s, v3.4h +; CHECK-NEXT: saddw2 v0.4s, v0.4s, v1.8h +; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h +; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s +; CHECK-NEXT: ext v17.16b, v3.16b, v3.16b, #12 +; CHECK-NEXT: zip1 v7.4s, v1.4s, v0.4s +; CHECK-NEXT: mov v16.16b, v3.16b +; CHECK-NEXT: zip2 v4.4s, v2.4s, v3.4s +; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s +; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s +; CHECK-NEXT: mov v16.s[0], v2.s[1] +; CHECK-NEXT: ext v19.16b, v1.16b, v7.16b, #8 +; CHECK-NEXT: ext v17.16b, v2.16b, v17.16b, #12 +; CHECK-NEXT: uzp2 v5.4s, v5.4s, v3.4s +; CHECK-NEXT: mov v1.s[3], v0.s[2] +; CHECK-NEXT: mov v2.s[1], v3.s[0] ; CHECK-NEXT: mov v16.d[1], v7.d[1] -; CHECK-NEXT: mov v4.d[1], v0.d[1] -; CHECK-NEXT: mov v2.d[1], v17.d[1] -; CHECK-NEXT: mov v3.d[1], v6.d[1] -; CHECK-NEXT: movi v0.8h, #1 -; CHECK-NEXT: add v1.4s, v16.4s, v4.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v16.4s -; CHECK-NEXT: add v6.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v3.4s -; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #4 -; CHECK-NEXT: zip2 v3.4s, v4.4s, v1.4s -; CHECK-NEXT: zip2 v7.4s, v2.4s, v6.4s -; CHECK-NEXT: ext v17.16b, v6.16b, v6.16b, #4 -; CHECK-NEXT: zip1 v16.4s, v1.4s, v4.4s -; CHECK-NEXT: zip2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: add v3.4s, v3.4s, v7.4s -; CHECK-NEXT: zip2 v7.4s, v6.4s, v2.4s -; CHECK-NEXT: zip1 v6.4s, v6.4s, v2.4s -; CHECK-NEXT: ext v4.16b, v5.16b, v4.16b, #8 -; CHECK-NEXT: ext v2.16b, v17.16b, v2.16b, #8 -; CHECK-NEXT: sub v1.4s, v7.4s, v1.4s -; CHECK-NEXT: sub v6.4s, v6.4s, v16.4s -; CHECK-NEXT: ext v4.16b, v4.16b, v5.16b, #4 -; CHECK-NEXT: ext v2.16b, v2.16b, v17.16b, #4 -; CHECK-NEXT: movi v7.2d, #0x00ffff0000ffff +; CHECK-NEXT: mov v5.d[1], v6.d[1] +; CHECK-NEXT: mov v18.d[1], v1.d[1] +; CHECK-NEXT: mov v2.d[1], v19.d[1] +; CHECK-NEXT: mov v4.d[1], v1.d[1] +; CHECK-NEXT: mov v17.d[1], v6.d[1] +; CHECK-NEXT: add v0.4s, v5.4s, v18.4s +; CHECK-NEXT: add v1.4s, v16.4s, v2.4s +; CHECK-NEXT: rev64 v3.4s, 
v0.4s +; CHECK-NEXT: rev64 v5.4s, v1.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v16.4s +; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s +; CHECK-NEXT: mov v3.d[1], v0.d[1] +; CHECK-NEXT: mov v5.d[1], v1.d[1] +; CHECK-NEXT: add v6.4s, v4.4s, v2.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v0.4s, v0.4s, v5.4s +; CHECK-NEXT: zip1 v3.4s, v1.4s, v2.4s +; CHECK-NEXT: zip1 v4.4s, v0.4s, v6.4s +; CHECK-NEXT: uzp2 v5.4s, v0.4s, v6.4s +; CHECK-NEXT: mov v17.16b, v1.16b +; CHECK-NEXT: zip2 v7.4s, v0.4s, v6.4s +; CHECK-NEXT: ext v16.16b, v1.16b, v3.16b, #8 +; CHECK-NEXT: trn2 v4.4s, v0.4s, v4.4s +; CHECK-NEXT: uzp2 v5.4s, v5.4s, v0.4s +; CHECK-NEXT: zip2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mov v17.s[3], v2.s[2] +; CHECK-NEXT: mov v0.s[1], v6.s[1] +; CHECK-NEXT: mov v4.d[1], v16.d[1] +; CHECK-NEXT: mov v5.d[1], v1.d[1] +; CHECK-NEXT: mov v7.d[1], v17.d[1] +; CHECK-NEXT: mov v0.d[1], v3.d[1] +; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: add v2.4s, v7.4s, v5.4s +; CHECK-NEXT: add v3.4s, v0.4s, v4.4s +; CHECK-NEXT: sub v5.4s, v5.4s, v7.4s +; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s +; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v0.4s, v3.4s +; CHECK-NEXT: zip2 v7.4s, v5.4s, v2.4s +; CHECK-NEXT: zip1 v16.4s, v2.4s, v5.4s +; CHECK-NEXT: zip2 v17.4s, v2.4s, v5.4s +; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: add v6.4s, v6.4s, v7.4s +; CHECK-NEXT: zip2 v7.4s, v3.4s, v0.4s +; CHECK-NEXT: zip1 v3.4s, v3.4s, v0.4s +; CHECK-NEXT: ext v0.16b, v4.16b, v0.16b, #8 +; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 +; CHECK-NEXT: sub v7.4s, v17.4s, v7.4s +; CHECK-NEXT: sub v3.4s, v16.4s, v3.4s +; CHECK-NEXT: ext v0.16b, v0.16b, v4.16b, #4 +; CHECK-NEXT: ext v2.16b, v5.16b, v2.16b, #4 +; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff ; CHECK-NEXT: ushr v5.4s, v3.4s, #15 -; CHECK-NEXT: ushr v16.4s, v6.4s, #15 -; CHECK-NEXT: ushr v17.4s, v1.4s, #15 -; CHECK-NEXT: add v2.4s, v4.4s, v2.4s -; CHECK-NEXT: and v5.16b, v5.16b, v0.16b -; 
CHECK-NEXT: ushr v4.4s, v2.4s, #15 -; CHECK-NEXT: and v17.16b, v17.16b, v0.16b -; CHECK-NEXT: and v16.16b, v16.16b, v0.16b -; CHECK-NEXT: and v0.16b, v4.16b, v0.16b -; CHECK-NEXT: mul v5.4s, v5.4s, v7.4s -; CHECK-NEXT: mul v16.4s, v16.4s, v7.4s -; CHECK-NEXT: mul v17.4s, v17.4s, v7.4s -; CHECK-NEXT: mul v0.4s, v0.4s, v7.4s -; CHECK-NEXT: add v3.4s, v5.4s, v3.4s -; CHECK-NEXT: add v6.4s, v16.4s, v6.4s -; CHECK-NEXT: add v1.4s, v17.4s, v1.4s -; CHECK-NEXT: add v2.4s, v0.4s, v2.4s -; CHECK-NEXT: eor v4.16b, v6.16b, v16.16b -; CHECK-NEXT: eor v1.16b, v1.16b, v17.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b +; CHECK-NEXT: ushr v4.4s, v6.4s, #15 +; CHECK-NEXT: ushr v16.4s, v7.4s, #15 +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-NEXT: mul v2.4s, v5.4s, v17.4s +; CHECK-NEXT: ushr v5.4s, v0.4s, #15 +; CHECK-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-NEXT: and v16.16b, v16.16b, v1.16b +; CHECK-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-NEXT: mul v4.4s, v4.4s, v17.4s +; CHECK-NEXT: mul v16.4s, v16.4s, v17.4s +; CHECK-NEXT: mul v1.4s, v1.4s, v17.4s +; CHECK-NEXT: add v3.4s, v2.4s, v3.4s +; CHECK-NEXT: add v5.4s, v4.4s, v6.4s +; CHECK-NEXT: add v6.4s, v16.4s, v7.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: eor v1.16b, v6.16b, v16.16b +; CHECK-NEXT: eor v3.16b, v5.16b, v4.16b ; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -493,138 +466,126 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) { ; CHECK-LABEL: v3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sxtw x8, w1 ; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 -; 
CHECK-NEXT: sxtw x11, w3 -; CHECK-NEXT: add x9, x0, x8 -; CHECK-NEXT: add x12, x2, x11 -; CHECK-NEXT: add x10, x9, x8 -; CHECK-NEXT: add x13, x12, x11 -; CHECK-NEXT: add x8, x10, x8 -; CHECK-NEXT: add x11, x13, x11 -; CHECK-NEXT: ldp s1, s5, [x9] -; CHECK-NEXT: ldp s0, s4, [x8] -; CHECK-NEXT: ld1 { v0.s }[1], [x10], #4 -; CHECK-NEXT: ld1 { v1.s }[1], [x0], #4 -; CHECK-NEXT: ldp s2, s6, [x11] -; CHECK-NEXT: ldp s3, s7, [x12] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w3 +; CHECK-NEXT: sxtw x9, w1 +; CHECK-NEXT: add x10, x2, x8 +; CHECK-NEXT: add x11, x0, x9 +; CHECK-NEXT: add x12, x10, x8 +; CHECK-NEXT: add x13, x11, x9 +; CHECK-NEXT: add x8, x12, x8 +; CHECK-NEXT: add x9, x13, x9 +; CHECK-NEXT: ldp s0, s6, [x11] +; CHECK-NEXT: ldp s3, s7, [x10] +; CHECK-NEXT: ldp s1, s5, [x8] +; CHECK-NEXT: ldp s2, s4, [x9] +; CHECK-NEXT: ld1 { v1.s }[1], [x12], #4 ; CHECK-NEXT: ld1 { v2.s }[1], [x13], #4 ; CHECK-NEXT: ld1 { v3.s }[1], [x2], #4 -; CHECK-NEXT: ld1 { v4.s }[1], [x10] -; CHECK-NEXT: ld1 { v5.s }[1], [x0] -; CHECK-NEXT: ld1 { v6.s }[1], [x13] +; CHECK-NEXT: ld1 { v0.s }[1], [x0], #4 +; CHECK-NEXT: ld1 { v5.s }[1], [x12] +; CHECK-NEXT: ld1 { v4.s }[1], [x13] ; CHECK-NEXT: ld1 { v7.s }[1], [x2] -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: usubl v16.4s, v0.4h, v2.4h -; CHECK-NEXT: usubl2 v0.4s, v0.8h, v2.8h -; CHECK-NEXT: usubl v2.4s, v1.4h, v3.4h -; CHECK-NEXT: usubl2 v1.4s, v1.8h, v3.8h -; CHECK-NEXT: ushll v3.8h, v4.8b, #0 -; CHECK-NEXT: ushll v4.8h, v5.8b, #0 -; CHECK-NEXT: ushll v5.8h, v6.8b, #0 -; CHECK-NEXT: ushll v6.8h, v7.8b, #0 -; CHECK-NEXT: usubl2 v7.4s, v3.8h, v5.8h -; CHECK-NEXT: usubl v3.4s, v3.4h, v5.4h -; CHECK-NEXT: usubl2 v5.4s, v4.8h, v6.8h -; CHECK-NEXT: usubl v4.4s, v4.4h, v6.4h -; CHECK-NEXT: shl v3.4s, v3.4s, #16 -; CHECK-NEXT: shl v6.4s, v7.4s, #16 -; CHECK-NEXT: shl v5.4s, v5.4s, #16 -; 
CHECK-NEXT: shl v4.4s, v4.4s, #16 -; CHECK-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-NEXT: add v3.4s, v3.4s, v16.4s -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-NEXT: add v2.4s, v4.4s, v2.4s -; CHECK-NEXT: rev64 v4.4s, v3.4s -; CHECK-NEXT: rev64 v5.4s, v0.4s -; CHECK-NEXT: rev64 v6.4s, v2.4s -; CHECK-NEXT: rev64 v17.4s, v1.4s -; CHECK-NEXT: addp v7.4s, v3.4s, v0.4s -; CHECK-NEXT: addp v16.4s, v2.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v17.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s -; CHECK-NEXT: zip2 v6.4s, v3.4s, v0.4s -; CHECK-NEXT: ext v17.16b, v2.16b, v1.16b, #4 -; CHECK-NEXT: mov v1.s[3], v2.s[2] +; CHECK-NEXT: ld1 { v6.s }[1], [x0] +; CHECK-NEXT: usubl v0.8h, v0.8b, v3.8b +; CHECK-NEXT: usubl v1.8h, v2.8b, v1.8b +; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b +; CHECK-NEXT: shll v4.4s, v2.4h, #16 +; CHECK-NEXT: shll v5.4s, v3.4h, #16 +; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 +; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 +; CHECK-NEXT: saddw2 v3.4s, v3.4s, v0.8h +; CHECK-NEXT: saddw v0.4s, v5.4s, v0.4h +; CHECK-NEXT: saddw2 v2.4s, v2.4s, v1.8h +; CHECK-NEXT: rev64 v17.4s, v3.4s +; CHECK-NEXT: rev64 v6.4s, v0.4s +; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h +; CHECK-NEXT: rev64 v5.4s, v2.4s +; CHECK-NEXT: addp v16.4s, v0.4s, v3.4s +; CHECK-NEXT: rev64 v4.4s, v1.4s +; CHECK-NEXT: sub v3.4s, v3.4s, v17.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s +; CHECK-NEXT: addp v7.4s, v1.4s, v2.4s +; CHECK-NEXT: ext v17.16b, v0.16b, v3.16b, #4 +; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s ; CHECK-NEXT: uzp2 v5.4s, v7.4s, v16.4s ; CHECK-NEXT: ext v4.16b, v16.16b, v16.16b, #8 ; CHECK-NEXT: uzp1 v16.4s, v7.4s, v16.4s -; CHECK-NEXT: zip1 v0.4s, v3.4s, v0.4s -; CHECK-NEXT: mov v6.d[1], v1.d[1] -; CHECK-NEXT: ext v1.16b, v17.16b, v2.16b, #4 -; CHECK-NEXT: rev64 v3.4s, v5.4s -; CHECK-NEXT: uzp1 v2.4s, v7.4s, v4.4s -; CHECK-NEXT: rev64 v5.4s, v16.4s +; 
CHECK-NEXT: zip2 v6.4s, v1.4s, v2.4s +; CHECK-NEXT: mov v3.s[3], v0.s[2] +; CHECK-NEXT: zip1 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ext v0.16b, v17.16b, v0.16b, #4 +; CHECK-NEXT: rev64 v2.4s, v5.4s +; CHECK-NEXT: uzp1 v5.4s, v7.4s, v4.4s +; CHECK-NEXT: rev64 v16.4s, v16.4s ; CHECK-NEXT: uzp2 v4.4s, v7.4s, v4.4s -; CHECK-NEXT: mov v0.d[1], v1.d[1] -; CHECK-NEXT: add v1.4s, v3.4s, v5.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s -; CHECK-NEXT: add v3.4s, v6.4s, v0.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s -; CHECK-NEXT: zip1 v4.4s, v1.4s, v3.4s -; CHECK-NEXT: uzp2 v5.4s, v1.4s, v3.4s -; CHECK-NEXT: zip1 v6.4s, v2.4s, v0.4s -; CHECK-NEXT: zip2 v7.4s, v1.4s, v3.4s -; CHECK-NEXT: trn2 v4.4s, v1.4s, v4.4s -; CHECK-NEXT: uzp2 v5.4s, v5.4s, v1.4s -; CHECK-NEXT: ext v16.16b, v2.16b, v6.16b, #8 -; CHECK-NEXT: zip2 v17.4s, v2.4s, v0.4s -; CHECK-NEXT: mov v1.s[1], v3.s[1] -; CHECK-NEXT: mov v2.s[3], v0.s[2] -; CHECK-NEXT: mov v4.d[1], v16.d[1] -; CHECK-NEXT: mov v5.d[1], v17.d[1] -; CHECK-NEXT: mov v1.d[1], v6.d[1] -; CHECK-NEXT: mov v7.d[1], v2.d[1] -; CHECK-NEXT: add v0.4s, v4.4s, v1.4s -; CHECK-NEXT: add v2.4s, v5.4s, v7.4s -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #4 -; CHECK-NEXT: sub v5.4s, v7.4s, v5.4s -; CHECK-NEXT: ext v6.16b, v3.16b, v1.16b, #8 -; CHECK-NEXT: zip1 v7.4s, v2.4s, v5.4s -; CHECK-NEXT: ext v17.16b, v4.16b, v5.16b, #8 -; CHECK-NEXT: zip2 v16.4s, v2.4s, v5.4s -; CHECK-NEXT: ext v3.16b, v6.16b, v3.16b, #4 -; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s -; CHECK-NEXT: zip2 v2.4s, v5.4s, v2.4s -; CHECK-NEXT: ext v4.16b, v17.16b, v4.16b, #4 -; CHECK-NEXT: zip2 v5.4s, v0.4s, v1.4s -; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: add v1.4s, v6.4s, v2.4s -; CHECK-NEXT: add v2.4s, v3.4s, v4.4s -; CHECK-NEXT: sub v3.4s, v16.4s, v5.4s -; CHECK-NEXT: sub v0.4s, v7.4s, v0.4s -; CHECK-NEXT: movi v4.8h, #1 +; CHECK-NEXT: mov v6.d[1], v3.d[1] +; CHECK-NEXT: mov v1.d[1], v0.d[1] +; CHECK-NEXT: add 
v0.4s, v2.4s, v16.4s +; CHECK-NEXT: sub v2.4s, v5.4s, v4.4s +; CHECK-NEXT: sub v3.4s, v1.4s, v6.4s +; CHECK-NEXT: add v1.4s, v6.4s, v1.4s +; CHECK-NEXT: zip1 v4.4s, v2.4s, v3.4s +; CHECK-NEXT: zip1 v5.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v6.4s, v0.4s, v1.4s +; CHECK-NEXT: zip2 v7.4s, v2.4s, v3.4s +; CHECK-NEXT: zip2 v16.4s, v0.4s, v1.4s +; CHECK-NEXT: ext v17.16b, v2.16b, v4.16b, #8 +; CHECK-NEXT: uzp2 v6.4s, v6.4s, v0.4s +; CHECK-NEXT: mov v2.s[3], v3.s[2] +; CHECK-NEXT: trn2 v3.4s, v0.4s, v5.4s +; CHECK-NEXT: mov v0.s[1], v1.s[1] +; CHECK-NEXT: mov v6.d[1], v7.d[1] +; CHECK-NEXT: mov v16.d[1], v2.d[1] +; CHECK-NEXT: mov v3.d[1], v17.d[1] +; CHECK-NEXT: mov v0.d[1], v4.d[1] +; CHECK-NEXT: add v1.4s, v6.4s, v16.4s +; CHECK-NEXT: sub v2.4s, v16.4s, v6.4s +; CHECK-NEXT: add v7.4s, v3.4s, v0.4s +; CHECK-NEXT: ext v6.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: sub v0.4s, v0.4s, v3.4s +; CHECK-NEXT: ext v3.16b, v7.16b, v7.16b, #4 +; CHECK-NEXT: zip1 v4.4s, v1.4s, v2.4s +; CHECK-NEXT: zip2 v5.4s, v1.4s, v2.4s +; CHECK-NEXT: zip2 v1.4s, v2.4s, v1.4s +; CHECK-NEXT: zip2 v16.4s, v0.4s, v7.4s +; CHECK-NEXT: zip1 v17.4s, v7.4s, v0.4s +; CHECK-NEXT: zip2 v7.4s, v7.4s, v0.4s +; CHECK-NEXT: ext v2.16b, v6.16b, v2.16b, #8 +; CHECK-NEXT: ext v0.16b, v3.16b, v0.16b, #8 +; CHECK-NEXT: add v1.4s, v16.4s, v1.4s +; CHECK-NEXT: movi v16.8h, #1 +; CHECK-NEXT: ext v2.16b, v2.16b, v6.16b, #4 +; CHECK-NEXT: ext v0.16b, v0.16b, v3.16b, #4 +; CHECK-NEXT: sub v3.4s, v5.4s, v7.4s +; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s +; CHECK-NEXT: ushr v5.4s, v1.4s, #15 +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ushr v6.4s, v3.4s, #15 ; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v5.4s, v0.4s, #15 -; CHECK-NEXT: ushr v6.4s, v1.4s, #15 -; CHECK-NEXT: ushr v7.4s, v2.4s, #15 -; CHECK-NEXT: ushr v16.4s, v3.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v4.16b -; CHECK-NEXT: and v16.16b, v16.16b, v4.16b -; CHECK-NEXT: and v7.16b, v7.16b, v4.16b -; CHECK-NEXT: and v4.16b, v5.16b, v4.16b -; 
CHECK-NEXT: mul v5.4s, v6.4s, v17.4s -; CHECK-NEXT: mul v6.4s, v16.4s, v17.4s -; CHECK-NEXT: mul v4.4s, v4.4s, v17.4s +; CHECK-NEXT: and v2.16b, v5.16b, v16.16b +; CHECK-NEXT: ushr v5.4s, v4.4s, #15 +; CHECK-NEXT: ushr v7.4s, v0.4s, #15 +; CHECK-NEXT: and v6.16b, v6.16b, v16.16b +; CHECK-NEXT: and v7.16b, v7.16b, v16.16b +; CHECK-NEXT: and v5.16b, v5.16b, v16.16b +; CHECK-NEXT: mul v2.4s, v2.4s, v17.4s +; CHECK-NEXT: mul v6.4s, v6.4s, v17.4s +; CHECK-NEXT: mul v5.4s, v5.4s, v17.4s ; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v0.4s, v4.4s, v0.4s -; CHECK-NEXT: add v2.4s, v7.4s, v2.4s -; CHECK-NEXT: eor v0.16b, v0.16b, v4.16b -; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b +; CHECK-NEXT: add v4.4s, v5.4s, v4.4s +; CHECK-NEXT: add v0.4s, v7.4s, v0.4s +; CHECK-NEXT: eor v4.16b, v4.16b, v5.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v7.16b ; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: eor v1.16b, v1.16b, v5.16b +; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b ; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v4.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0