diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14268,12 +14268,49 @@ Op1 ? Op1 : Mul->getOperand(1)); } +// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz: the srl by (half-size - 1) plus the 1|1<<16 mask isolate each i16 half's sign bit, and the mul by the 0xffff half-mask smears that bit across its half -- exactly a compare-less-than-zero on the half-width vector. +// Same for the other listed vector types with the equivalent (half-size-derived) constants; the result is NVCAST'd back to the original type. +static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 && + VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16) + return SDValue(); + if (N->getOperand(0).getOpcode() != ISD::AND || + N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL) + return SDValue(); + + SDValue And = N->getOperand(0); + SDValue Srl = And.getOperand(0); + + APInt V1, V2, V3; + if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) || + !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) || + !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3)) + return SDValue(); + + unsigned HalfSize = VT.getScalarSizeInBits() / 2; + if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) || + V3 != (HalfSize - 1)) + return SDValue(); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), + EVT::getIntegerVT(*DAG.getContext(), HalfSize), + VT.getVectorElementCount() * 2); + + SDLoc DL(N); + SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0)); + SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In); + return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM); +} + static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (SDValue Ext = performMulVectorExtendCombine(N, DAG)) return Ext; + if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG)) + return Ext; if (DCI.isBeforeLegalizeOps()) return SDValue(); diff --git 
a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll --- a/llvm/test/CodeGen/AArch64/insert-extend.ll +++ b/llvm/test/CodeGen/AArch64/insert-extend.ll @@ -118,58 +118,48 @@ ; CHECK-NEXT: zip1 v16.4s, v6.4s, v6.4s ; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s ; CHECK-NEXT: ext v17.16b, v1.16b, v3.16b, #8 -; CHECK-NEXT: ext v5.16b, v3.16b, v2.16b, #4 -; CHECK-NEXT: ext v7.16b, v6.16b, v4.16b, #4 +; CHECK-NEXT: ext v5.16b, v6.16b, v4.16b, #4 +; CHECK-NEXT: ext v7.16b, v3.16b, v2.16b, #4 ; CHECK-NEXT: ext v18.16b, v0.16b, v6.16b, #4 ; CHECK-NEXT: trn2 v0.4s, v16.4s, v0.4s ; CHECK-NEXT: ext v16.16b, v17.16b, v1.16b, #4 -; CHECK-NEXT: zip2 v5.4s, v5.4s, v3.4s -; CHECK-NEXT: zip2 v7.4s, v7.4s, v6.4s +; CHECK-NEXT: zip2 v7.4s, v7.4s, v3.4s +; CHECK-NEXT: zip2 v5.4s, v5.4s, v6.4s ; CHECK-NEXT: ext v18.16b, v18.16b, v18.16b, #4 ; CHECK-NEXT: mov v1.s[2], v3.s[1] -; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #12 -; CHECK-NEXT: ext v7.16b, v4.16b, v7.16b, #12 +; CHECK-NEXT: uzp2 v16.4s, v17.4s, v16.4s +; CHECK-NEXT: ext v7.16b, v2.16b, v7.16b, #12 +; CHECK-NEXT: ext v5.16b, v4.16b, v5.16b, #12 ; CHECK-NEXT: mov v2.s[2], v3.s[3] ; CHECK-NEXT: mov v4.s[2], v6.s[3] -; CHECK-NEXT: uzp2 v16.4s, v17.4s, v16.4s -; CHECK-NEXT: sub v19.4s, v0.4s, v18.4s +; CHECK-NEXT: sub v17.4s, v0.4s, v18.4s ; CHECK-NEXT: mov v18.s[0], v6.s[1] -; CHECK-NEXT: sub v17.4s, v2.4s, v5.4s -; CHECK-NEXT: sub v20.4s, v4.4s, v7.4s -; CHECK-NEXT: sub v21.4s, v1.4s, v16.4s +; CHECK-NEXT: sub v19.4s, v1.4s, v16.4s +; CHECK-NEXT: sub v20.4s, v2.4s, v7.4s +; CHECK-NEXT: sub v21.4s, v4.4s, v5.4s +; CHECK-NEXT: mov v1.s[1], v3.s[0] ; CHECK-NEXT: mov v2.s[1], v3.s[2] ; CHECK-NEXT: mov v4.s[1], v6.s[2] -; CHECK-NEXT: mov v1.s[1], v3.s[0] ; CHECK-NEXT: add v0.4s, v0.4s, v18.4s -; CHECK-NEXT: add v2.4s, v2.4s, v5.4s -; CHECK-NEXT: add v3.4s, v4.4s, v7.4s ; CHECK-NEXT: add v1.4s, v1.4s, v16.4s -; CHECK-NEXT: mov v0.d[1], v19.d[1] -; CHECK-NEXT: mov v1.d[1], v21.d[1] -; CHECK-NEXT: mov v2.d[1], 
v17.d[1] -; CHECK-NEXT: mov v3.d[1], v20.d[1] -; CHECK-NEXT: movi v4.8h, #1 -; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v5.4s, v2.4s, #15 -; CHECK-NEXT: ushr v6.4s, v0.4s, #15 -; CHECK-NEXT: ushr v7.4s, v3.4s, #15 -; CHECK-NEXT: ushr v16.4s, v1.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v4.16b -; CHECK-NEXT: and v16.16b, v16.16b, v4.16b -; CHECK-NEXT: and v7.16b, v7.16b, v4.16b -; CHECK-NEXT: and v4.16b, v5.16b, v4.16b -; CHECK-NEXT: mul v5.4s, v6.4s, v17.4s -; CHECK-NEXT: mul v6.4s, v16.4s, v17.4s -; CHECK-NEXT: mul v4.4s, v4.4s, v17.4s -; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s -; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: add v1.4s, v6.4s, v1.4s +; CHECK-NEXT: add v2.4s, v2.4s, v7.4s +; CHECK-NEXT: add v3.4s, v4.4s, v5.4s +; CHECK-NEXT: mov v2.d[1], v20.d[1] +; CHECK-NEXT: mov v3.d[1], v21.d[1] +; CHECK-NEXT: mov v0.d[1], v17.d[1] +; CHECK-NEXT: mov v1.d[1], v19.d[1] +; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 +; CHECK-NEXT: cmlt v5.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v0.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v1.8h, #0 +; CHECK-NEXT: add v0.4s, v6.4s, v0.4s +; CHECK-NEXT: add v1.4s, v7.4s, v1.4s ; CHECK-NEXT: add v2.4s, v4.4s, v2.4s -; CHECK-NEXT: add v3.4s, v7.4s, v3.4s +; CHECK-NEXT: add v3.4s, v5.4s, v3.4s ; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b -; CHECK-NEXT: eor v1.16b, v1.16b, v6.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b +; CHECK-NEXT: eor v3.16b, v3.16b, v5.16b +; CHECK-NEXT: eor v1.16b, v1.16b, v7.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: add v1.4s, v3.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s diff --git a/llvm/test/CodeGen/AArch64/mulcmle.ll b/llvm/test/CodeGen/AArch64/mulcmle.ll --- a/llvm/test/CodeGen/AArch64/mulcmle.ll +++ b/llvm/test/CodeGen/AArch64/mulcmle.ll @@ -4,13 +4,7 @@ define <1 x i64> @v1i64(<1 x i64> %a) { ; CHECK-LABEL: v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.2s, #1 -; CHECK-NEXT: ushr d0, d0, #31 -; 
CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: lsl x9, x8, #32 -; CHECK-NEXT: sub x8, x9, x8 -; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 ; CHECK-NEXT: ret %b = lshr <1 x i64> %a, %c = and <1 x i64> %b, @@ -21,17 +15,7 @@ define <2 x i64> @v2i64(<2 x i64> %a) { ; CHECK-LABEL: v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ushr v0.2d, v0.2d, #31 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: mov x8, v0.d[1] -; CHECK-NEXT: lsl x10, x9, #32 -; CHECK-NEXT: sub x9, x10, x9 -; CHECK-NEXT: lsl x10, x8, #32 -; CHECK-NEXT: sub x8, x10, x8 -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 ; CHECK-NEXT: ret %b = lshr <2 x i64> %a, %c = and <2 x i64> %b, @@ -42,11 +26,7 @@ define <2 x i32> @v2i32(<2 x i32> %a) { ; CHECK-LABEL: v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4h, #1 -; CHECK-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-NEXT: ushr v0.2s, v0.2s, #15 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mul v0.2s, v0.2s, v2.2s +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 ; CHECK-NEXT: ret %b = lshr <2 x i32> %a, %c = and <2 x i32> %b, @@ -57,11 +37,7 @@ define <4 x i32> @v4i32(<4 x i32> %a) { ; CHECK-LABEL: v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.8h, #1 -; CHECK-NEXT: ushr v0.4s, v0.4s, #15 -; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 ; CHECK-NEXT: ret %b = lshr <4 x i32> %a, %c = and <4 x i32> %b, @@ -72,14 +48,8 @@ define <8 x i32> @v8i32(<8 x i32> %a) { ; CHECK-LABEL: v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.8h, #1 -; CHECK-NEXT: ushr v1.4s, v1.4s, #15 -; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v0.4s, v0.4s, #15 -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: mul v0.4s, v0.4s, v3.4s -; CHECK-NEXT: mul v1.4s, v1.4s, 
v3.4s +; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 +; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 ; CHECK-NEXT: ret %b = lshr <8 x i32> %a, %c = and <8 x i32> %b, @@ -90,11 +60,7 @@ define <4 x i16> @v4i16(<4 x i16> %a) { ; CHECK-LABEL: v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.8b, #1 -; CHECK-NEXT: movi d2, #0xff00ff00ff00ff -; CHECK-NEXT: ushr v0.4h, v0.4h, #7 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mul v0.4h, v0.4h, v2.4h +; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 ; CHECK-NEXT: ret %b = lshr <4 x i16> %a, %c = and <4 x i16> %b, @@ -105,11 +71,7 @@ define <8 x i16> @v8i16(<8 x i16> %a) { ; CHECK-LABEL: v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.16b, #1 -; CHECK-NEXT: ushr v0.8h, v0.8h, #7 -; CHECK-NEXT: movi v2.2d, #0xff00ff00ff00ff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: mul v0.8h, v0.8h, v2.8h +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: ret %b = lshr <8 x i16> %a, %c = and <8 x i16> %b, diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -93,47 +93,37 @@ ; CHECK-NEXT: ext v17.16b, v3.16b, v17.16b, #12 ; CHECK-NEXT: mov v0.s[2], v2.s[1] ; CHECK-NEXT: uzp2 v4.4s, v4.4s, v18.4s -; CHECK-NEXT: mov v7.s[2], v2.s[3] ; CHECK-NEXT: mov v3.s[2], v5.s[3] +; CHECK-NEXT: mov v7.s[2], v2.s[3] ; CHECK-NEXT: sub v18.4s, v1.4s, v6.4s ; CHECK-NEXT: mov v6.s[0], v5.s[1] ; CHECK-NEXT: sub v19.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v20.4s, v7.4s, v16.4s -; CHECK-NEXT: sub v21.4s, v3.4s, v17.4s +; CHECK-NEXT: sub v20.4s, v3.4s, v17.4s +; CHECK-NEXT: sub v21.4s, v7.4s, v16.4s ; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: mov v7.s[1], v2.s[2] ; CHECK-NEXT: mov v3.s[1], v5.s[2] +; CHECK-NEXT: mov v7.s[1], v2.s[2] ; CHECK-NEXT: add v1.4s, v1.4s, v6.4s ; CHECK-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NEXT: add v2.4s, v7.4s, v16.4s -; CHECK-NEXT: add v3.4s, v3.4s, v17.4s -; CHECK-NEXT: mov v2.d[1], v20.d[1] 
-; CHECK-NEXT: mov v3.d[1], v21.d[1] +; CHECK-NEXT: add v2.4s, v3.4s, v17.4s +; CHECK-NEXT: add v3.4s, v7.4s, v16.4s ; CHECK-NEXT: mov v1.d[1], v18.d[1] ; CHECK-NEXT: mov v0.d[1], v19.d[1] -; CHECK-NEXT: movi v4.8h, #1 -; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v5.4s, v1.4s, #15 -; CHECK-NEXT: ushr v6.4s, v2.4s, #15 -; CHECK-NEXT: ushr v7.4s, v0.4s, #15 -; CHECK-NEXT: ushr v16.4s, v3.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v4.16b -; CHECK-NEXT: and v16.16b, v16.16b, v4.16b -; CHECK-NEXT: and v7.16b, v7.16b, v4.16b -; CHECK-NEXT: and v4.16b, v5.16b, v4.16b -; CHECK-NEXT: mul v5.4s, v6.4s, v17.4s -; CHECK-NEXT: mul v6.4s, v16.4s, v17.4s -; CHECK-NEXT: mul v4.4s, v4.4s, v17.4s -; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s -; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: mov v3.d[1], v21.d[1] +; CHECK-NEXT: mov v2.d[1], v20.d[1] +; CHECK-NEXT: cmlt v4.8h, v1.8h, #0 +; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v2.8h, #0 ; CHECK-NEXT: add v3.4s, v6.4s, v3.4s +; CHECK-NEXT: add v2.4s, v7.4s, v2.4s ; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: add v0.4s, v7.4s, v0.4s +; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: eor v1.16b, v1.16b, v4.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v7.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b +; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b ; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b -; CHECK-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s @@ -316,50 +306,40 @@ ; CHECK-NEXT: mov v5.d[1], v1.d[1] ; CHECK-NEXT: mov v7.d[1], v17.d[1] ; CHECK-NEXT: mov v0.d[1], v3.d[1] -; CHECK-NEXT: movi v1.8h, #1 -; CHECK-NEXT: add v2.4s, v7.4s, v5.4s -; CHECK-NEXT: add v3.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v5.4s, v5.4s, v7.4s +; CHECK-NEXT: add v1.4s, v7.4s, v5.4s +; CHECK-NEXT: add v2.4s, v0.4s, v4.4s ; 
CHECK-NEXT: sub v0.4s, v4.4s, v0.4s -; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #4 -; CHECK-NEXT: zip2 v6.4s, v0.4s, v3.4s -; CHECK-NEXT: zip2 v7.4s, v5.4s, v2.4s -; CHECK-NEXT: zip1 v16.4s, v2.4s, v5.4s -; CHECK-NEXT: zip2 v17.4s, v2.4s, v5.4s -; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #4 -; CHECK-NEXT: add v6.4s, v6.4s, v7.4s -; CHECK-NEXT: zip2 v7.4s, v3.4s, v0.4s -; CHECK-NEXT: zip1 v3.4s, v3.4s, v0.4s +; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: ext v16.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: sub v3.4s, v5.4s, v7.4s +; CHECK-NEXT: zip2 v5.4s, v0.4s, v2.4s +; CHECK-NEXT: zip1 v6.4s, v1.4s, v3.4s +; CHECK-NEXT: zip2 v7.4s, v1.4s, v3.4s +; CHECK-NEXT: zip2 v1.4s, v3.4s, v1.4s +; CHECK-NEXT: zip1 v17.4s, v2.4s, v0.4s +; CHECK-NEXT: zip2 v2.4s, v2.4s, v0.4s ; CHECK-NEXT: ext v0.16b, v4.16b, v0.16b, #8 -; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 -; CHECK-NEXT: sub v7.4s, v17.4s, v7.4s -; CHECK-NEXT: sub v3.4s, v16.4s, v3.4s +; CHECK-NEXT: ext v3.16b, v16.16b, v3.16b, #8 +; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: sub v5.4s, v6.4s, v17.4s ; CHECK-NEXT: ext v0.16b, v0.16b, v4.16b, #4 -; CHECK-NEXT: ext v2.16b, v5.16b, v2.16b, #4 -; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v5.4s, v3.4s, #15 -; CHECK-NEXT: ushr v4.4s, v6.4s, #15 -; CHECK-NEXT: ushr v16.4s, v7.4s, #15 -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: and v5.16b, v5.16b, v1.16b -; CHECK-NEXT: mul v2.4s, v5.4s, v17.4s -; CHECK-NEXT: ushr v5.4s, v0.4s, #15 -; CHECK-NEXT: and v4.16b, v4.16b, v1.16b -; CHECK-NEXT: and v16.16b, v16.16b, v1.16b -; CHECK-NEXT: and v1.16b, v5.16b, v1.16b -; CHECK-NEXT: mul v4.4s, v4.4s, v17.4s -; CHECK-NEXT: mul v16.4s, v16.4s, v17.4s -; CHECK-NEXT: mul v1.4s, v1.4s, v17.4s -; CHECK-NEXT: add v3.4s, v2.4s, v3.4s -; CHECK-NEXT: add v5.4s, v4.4s, v6.4s -; CHECK-NEXT: add v6.4s, v16.4s, v7.4s -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: eor 
v1.16b, v6.16b, v16.16b -; CHECK-NEXT: eor v3.16b, v5.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ext v3.16b, v3.16b, v16.16b, #4 +; CHECK-NEXT: cmlt v6.8h, v5.8h, #0 +; CHECK-NEXT: sub v2.4s, v7.4s, v2.4s +; CHECK-NEXT: add v4.4s, v6.4s, v5.4s +; CHECK-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-NEXT: cmlt v7.8h, v2.8h, #0 +; CHECK-NEXT: cmlt v17.8h, v1.8h, #0 +; CHECK-NEXT: eor v3.16b, v4.16b, v6.16b +; CHECK-NEXT: cmlt v4.8h, v0.8h, #0 +; CHECK-NEXT: add v1.4s, v17.4s, v1.4s +; CHECK-NEXT: add v2.4s, v7.4s, v2.4s +; CHECK-NEXT: add v0.4s, v4.4s, v0.4s +; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b +; CHECK-NEXT: eor v1.16b, v1.16b, v17.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v4.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v3.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -557,25 +537,15 @@ ; CHECK-NEXT: ext v2.16b, v6.16b, v2.16b, #8 ; CHECK-NEXT: ext v0.16b, v3.16b, v0.16b, #8 ; CHECK-NEXT: add v1.4s, v16.4s, v1.4s -; CHECK-NEXT: movi v16.8h, #1 +; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s ; CHECK-NEXT: ext v2.16b, v2.16b, v6.16b, #4 ; CHECK-NEXT: ext v0.16b, v0.16b, v3.16b, #4 ; CHECK-NEXT: sub v3.4s, v5.4s, v7.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s -; CHECK-NEXT: ushr v5.4s, v1.4s, #15 +; CHECK-NEXT: cmlt v5.8h, v4.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ushr v6.4s, v3.4s, #15 -; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v2.16b, v5.16b, v16.16b -; CHECK-NEXT: ushr v5.4s, v4.4s, #15 -; CHECK-NEXT: ushr v7.4s, v0.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v16.16b -; CHECK-NEXT: and v7.16b, v7.16b, v16.16b -; CHECK-NEXT: and v5.16b, v5.16b, v16.16b -; CHECK-NEXT: mul v2.4s, v2.4s, v17.4s -; CHECK-NEXT: mul v6.4s, v6.4s, v17.4s -; CHECK-NEXT: mul v5.4s, v5.4s, v17.4s -; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s +; CHECK-NEXT: cmlt v2.8h, v1.8h, #0 +; CHECK-NEXT: 
cmlt v7.8h, v0.8h, #0 ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v3.4s, v6.4s, v3.4s ; CHECK-NEXT: add v4.4s, v5.4s, v4.4s