Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18533,6 +18533,250 @@ DAG.getConstant(0, DL, MVT::i64)); } +static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) { + SDValue BV = peekThroughOneUseBitcasts(B); + if (!BV->hasOneUse()) + return false; + if (auto *Ld = dyn_cast<LoadSDNode>(BV)) { + if (!Ld || !Ld->isSimple()) + return false; + Loads.push_back(Ld); + return true; + } else if (BV.getOpcode() == ISD::BUILD_VECTOR || + BV.getOpcode() == ISD::CONCAT_VECTORS) { + for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) { + auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op)); + if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse()) + return false; + Loads.push_back(Ld); + } + return true; + } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) { + // Try to find a tree of shuffles and concats from how IR shuffles of loads + // are lowered. Note that this only comes up because we do not always visit + // operands before uses. After that is fixed this can be removed and in the + // meantime this is fairly specific to the lowering we expect from IR.
+ // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45 + // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43 + // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8 + // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64 + // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64 + // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8 + // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64 + // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8 + // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64 + if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE || + B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS || + B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS || + B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS || + B.getOperand(1).getNumOperands() != 4) + return false; + auto SV1 = cast<ShuffleVectorSDNode>(B); + auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0)); + int NumElts = B.getValueType().getVectorNumElements(); + int NumSubElts = NumElts / 4; + for (int I = 0; I < NumSubElts; I++) { + // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> + if (SV1->getMaskElt(I) != I || + SV1->getMaskElt(I + NumSubElts) != I + NumSubElts || + SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 || + SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts) + return false; + // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> + if (SV2->getMaskElt(I) != I || + SV2->getMaskElt(I + NumSubElts) != I + NumSubElts || + SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts) + return false; + } + auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0)); + auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1)); + auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0)); + auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0)); + if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() || + !Ld2->isSimple() || !Ld3->isSimple()) + return false; +
Loads.push_back(Ld0); + Loads.push_back(Ld1); + Loads.push_back(Ld2); + Loads.push_back(Ld3); + return true; + } + return false; +} + +static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, + SelectionDAG &DAG, + unsigned &NumSubLoads) { + if (!Op0.hasOneUse() || !Op1.hasOneUse()) + return false; + + SmallVector<LoadSDNode *> Loads0, Loads1; + if (isLoadOrMultipleLoads(Op0, Loads0) && + isLoadOrMultipleLoads(Op1, Loads1)) { + if (NumSubLoads && Loads0.size() != NumSubLoads) + return false; + NumSubLoads = Loads0.size(); + return Loads0.size() == Loads1.size() && + all_of(zip(Loads0, Loads1), [&DAG](auto L) { + unsigned Size = get<0>(L)->getValueType(0).getSizeInBits(); + return Size == get<1>(L)->getValueType(0).getSizeInBits() && + DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L), + Size / 8, 1); + }); + } + + if (Op0.getOpcode() != Op1.getOpcode()) + return false; + + switch (Op0.getOpcode()) { + case ISD::ADD: + case ISD::SUB: + return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0), + DAG, NumSubLoads) && + areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1), + DAG, NumSubLoads); + case ISD::SIGN_EXTEND: + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: + EVT XVT = Op0.getOperand(0).getValueType(); + if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 && + XVT.getScalarSizeInBits() != 32) + return false; + return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0), + DAG, NumSubLoads); + } + return false; +} + +// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4))) +// into a single load of twice the size, that we extract the bottom part and top +// part so that the shl can use a shll2 instruction. The two loads in that +// example can also be larger trees of instructions, which are identical except +// for the leaves which are all loads offset from the LHS, including +// buildvectors of multiple loads.
For example the RHS tree could be +// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))) +// Whilst it can be common for the larger loads to replace LDP instructions +// (which doesn't gain anything on its own), the larger loads can help create +// more efficient code, and in buildvectors prevent the need for ld1 lane +// inserts which can be slower than normal loads. +static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + if (!VT.isFixedLengthVector() || + (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 && + VT.getScalarSizeInBits() != 64)) + return SDValue(); + + SDValue Other = N->getOperand(0); + SDValue Shift = N->getOperand(1); + if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB) + std::swap(Shift, Other); + APInt ShiftAmt; + if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() || + !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt)) + return SDValue(); + + if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) || + !ISD::isExtOpcode(Other.getOpcode()) || + Shift.getOperand(0).getOperand(0).getValueType() != + Other.getOperand(0).getValueType() || + !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse()) + return SDValue(); + + SDValue Op0 = Other.getOperand(0); + SDValue Op1 = Shift.getOperand(0).getOperand(0); + + unsigned NumSubLoads = 0; + if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads)) + return SDValue(); + + // Attempt to rule out some unprofitable cases using heuristics (some working + // around suboptimal code generation), notably if the extend is not able to + // use ushll2 instructions as the types are not large enough. Otherwise zips + // will need to be created which can increase the instruction count.
+ unsigned NumElts = Op0.getValueType().getVectorNumElements(); + unsigned NumSubElts = NumElts / NumSubLoads; + if (NumSubElts * VT.getScalarSizeInBits() < 128 || + (Other.getOpcode() != Shift.getOperand(0).getOpcode() && + Op0.getValueType().getSizeInBits() < 128 && + !DAG.getTargetLoweringInfo().isTypeLegal(Op0.getValueType()))) + return SDValue(); + + // Recreate the tree with the new combined loads. + std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree = + [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) { + EVT DVT = + Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext()); + + SmallVector<LoadSDNode *> Loads0, Loads1; + if (isLoadOrMultipleLoads(Op0, Loads0) && + isLoadOrMultipleLoads(Op1, Loads1)) { + EVT LoadVT = EVT::getVectorVT( + *DAG.getContext(), Op0.getValueType().getScalarType(), + Op0.getValueType().getVectorNumElements() / Loads0.size()); + EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext()); + + SmallVector<SDValue> NewLoads; + for (const auto &[L0, L1] : zip(Loads0, Loads1)) { + SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(), + L0->getBasePtr(), L0->getPointerInfo(), + L0->getOriginalAlign()); + DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1)); + DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1)); + NewLoads.push_back(Load); + } + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads); + } + + SmallVector<SDValue> Ops; + for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values())) + Ops.push_back(GenCombinedTree(O0, O1, DAG)); + return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops); + }; + SDValue NewOp = GenCombinedTree(Op0, Op1, DAG); + + SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0); + int Hi = NumSubElts, Lo = 0; + for (unsigned i = 0; i < NumSubLoads; i++) { + for (unsigned j = 0; j < NumSubElts; j++) { + LowMask[i * NumSubElts + j] = Lo++; + HighMask[i * NumSubElts + j] = Hi++; + } + Lo += NumSubElts; + Hi += NumSubElts; + } + SDLoc DL(N); + SDValue Ext0, Ext1; + // Extract the top and bottom
lanes, then extend the result. Possibly extend + // the result then extract the lanes if the two operands match as it produces + // slightly smaller code. + if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) { + SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), + NewOp, DAG.getConstant(0, DL, MVT::i64)); + SDValue SubH = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp, + DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64)); + SDValue Extr0 = + DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask); + SDValue Extr1 = + DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask); + Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0); + Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1); + } else { + EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext()); + SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp); + SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext, + DAG.getConstant(0, DL, MVT::i64)); + SDValue SubH = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext, + DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64)); + Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask); + Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask); + } + SDValue NShift = + DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1)); + return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift); +} + static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Try to change sum of two reductions.
@@ -18555,6 +18799,9 @@ if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG)) return Val; + if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG)) + return Val; + return performAddSubLongCombine(N, DCI); } Index: llvm/test/CodeGen/AArch64/extbinopload.ll =================================================================== --- llvm/test/CodeGen/AArch64/extbinopload.ll +++ llvm/test/CodeGen/AArch64/extbinopload.ll @@ -55,9 +55,9 @@ define <4 x i32> @load_v4i16_v4i32(ptr %p) { ; CHECK-LABEL: load_v4i16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d1, d0, [x0] -; CHECK-NEXT: ushll v0.4s, v0.4h, #3 -; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 +; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h ; CHECK-NEXT: ret %l1 = load <4 x i16>, ptr %p %q = getelementptr i8, ptr %p, i32 8 @@ -91,11 +91,10 @@ define <4 x i32> @load_v4i8_v4i32(ptr %p) { ; CHECK-LABEL: load_v4i8_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s1, s0, [x0] +; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #3 -; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 +; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h ; CHECK-NEXT: ret %l1 = load <4 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 4 @@ -110,30 +109,28 @@ define <4 x i32> @load_v4i12_v4i32(ptr %p) { ; CHECK-LABEL: load_v4i12_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldur w8, [x0, #6] -; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: ldrh w12, [x0, #10] -; CHECK-NEXT: and w10, w8, #0xfff -; CHECK-NEXT: ldrh w13, [x0, #4] -; CHECK-NEXT: and w11, w9, #0xfff +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: ldr w9, [x0, #8] +; CHECK-NEXT: ubfx x10, x8, #48, #12 +; CHECK-NEXT: lsr x11, x8, #60 +; CHECK-NEXT: orr w11, w11, w9, lsl #4 +; CHECK-NEXT: and w12, w8, #0xfff +; CHECK-NEXT: and w11, w11, #0xfff ; CHECK-NEXT: fmov s0, w10 ; CHECK-NEXT: ubfx w10, w8, #12, #12 -; CHECK-NEXT: fmov s1, w11 -; CHECK-NEXT: ubfx 
w11, w9, #12, #12 -; CHECK-NEXT: orr x8, x8, x12, lsl #32 -; CHECK-NEXT: orr x9, x9, x13, lsl #32 -; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: ubfx x8, x8, #24, #12 -; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: ubfx x9, x9, #24, #12 -; CHECK-NEXT: mov v0.s[2], w8 -; CHECK-NEXT: ubfx w8, w12, #4, #12 -; CHECK-NEXT: mov v1.s[2], w9 -; CHECK-NEXT: ubfx w9, w13, #4, #12 -; CHECK-NEXT: mov v0.s[3], w8 -; CHECK-NEXT: mov v1.s[3], w9 -; CHECK-NEXT: shl v0.4s, v0.4s, #3 -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: fmov s1, w12 +; CHECK-NEXT: mov v0.h[1], w11 +; CHECK-NEXT: ubfx w11, w9, #8, #12 +; CHECK-NEXT: mov v1.h[1], w10 +; CHECK-NEXT: ubfx x10, x8, #24, #12 +; CHECK-NEXT: lsr x9, x9, #20 +; CHECK-NEXT: ubfx x8, x8, #36, #12 +; CHECK-NEXT: mov v0.h[2], w11 +; CHECK-NEXT: mov v1.h[2], w10 +; CHECK-NEXT: mov v0.h[3], w9 +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: ret %l1 = load <4 x i12>, ptr %p %q = getelementptr i8, ptr %p, i32 6 @@ -148,9 +145,9 @@ define <8 x i16> @load_v8i8(ptr %p) { ; CHECK-LABEL: load_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d1, d0, [x0] -; CHECK-NEXT: ushll v0.8h, v0.8b, #3 -; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #3 +; CHECK-NEXT: uaddw v0.8h, v1.8h, v0.8b ; CHECK-NEXT: ret %l1 = load <8 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 8 @@ -165,11 +162,10 @@ define <8 x i16> @loadadd_v8i8(ptr %p1, ptr %p2) { ; CHECK-LABEL: loadadd_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] -; CHECK-NEXT: ldp d3, d2, [x1] -; CHECK-NEXT: add v0.8b, v0.8b, v3.8b -; CHECK-NEXT: add v1.8b, v1.8b, v2.8b -; CHECK-NEXT: ushll v1.8h, v1.8b, #3 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #3 ; CHECK-NEXT: uaddw v0.8h, v1.8h, v0.8b ; CHECK-NEXT: ret %l11 = load <8 x i8>, ptr %p1 @@ -190,14 +186,14 @@ define <8 x 
i32> @loadaddext_v8i8(ptr %p1, ptr %p2) { ; CHECK-LABEL: loadaddext_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d2, d0, [x0] -; CHECK-NEXT: ldp d3, d1, [x1] -; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: uaddl2 v2.8h, v0.16b, v1.16b ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 -; CHECK-NEXT: ushll v0.4s, v0.4h, #3 -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v2.8h -; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #3 +; CHECK-NEXT: ushll v2.4s, v2.4h, #3 +; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v0.8h +; CHECK-NEXT: uaddw v0.4s, v2.4s, v0.4h ; CHECK-NEXT: ret %l11 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -221,15 +217,10 @@ define <4 x i32> @loadaddext_v4i8(ptr %p1, ptr %p2) { ; CHECK-LABEL: loadaddext_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: ldp s2, s3, [x1] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: add v1.4h, v1.4h, v3.4h -; CHECK-NEXT: add v0.4h, v0.4h, v2.4h -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 ; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h ; CHECK-NEXT: ret %l11 = load <4 x i8>, ptr %p1 @@ -321,15 +312,14 @@ define <8 x i32> @load_bv_v4i8_i32(ptr %p, ptr %q) { ; CHECK-LABEL: load_bv_v4i8_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ld1 { v1.s }[1], [x1] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 v2.4s, v1.8h, #3 -; CHECK-NEXT: ushll v3.4s, v1.4h, #3 -; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v0.8h -; CHECK-NEXT: uaddw v0.4s, v3.4s, v0.4h +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3 +; 
CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 +; CHECK-NEXT: uaddw v0.4s, v2.4s, v0.4h +; CHECK-NEXT: uaddw v1.4s, v3.4s, v1.4h ; CHECK-NEXT: ret %j1 = load <4 x i8>, ptr %p %p1 = getelementptr i8, ptr %p, i32 4 @@ -349,12 +339,12 @@ define <8 x i32> @load_bv_v4i16_i32(ptr %p, ptr %q) { ; CHECK-LABEL: load_bv_v4i16_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] -; CHECK-NEXT: ldp d3, d2, [x1] -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 -; CHECK-NEXT: ushll v2.4s, v2.4h, #3 -; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h -; CHECK-NEXT: uaddw v1.4s, v2.4s, v3.4h +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 +; CHECK-NEXT: uaddw v0.4s, v2.4s, v0.4h +; CHECK-NEXT: uaddw v1.4s, v3.4s, v1.4h ; CHECK-NEXT: ret %j1 = load <4 x i16>, ptr %p %p1 = getelementptr i8, ptr %p, i32 8 @@ -575,30 +565,26 @@ define <16 x i32> @double2_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ptr %u, ptr %v, ptr %w) { ; CHECK-LABEL: double2_bv_4xv4i8_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x2] -; CHECK-NEXT: ldp s2, s3, [x0] -; CHECK-NEXT: ldp s4, s5, [x6] -; CHECK-NEXT: ldp s6, s7, [x4] -; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 -; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 -; CHECK-NEXT: ld1 { v4.s }[1], [x7], #4 -; CHECK-NEXT: ld1 { v6.s }[1], [x5], #4 -; CHECK-NEXT: ld1 { v1.s }[1], [x3] -; CHECK-NEXT: ld1 { v3.s }[1], [x1] -; CHECK-NEXT: ld1 { v5.s }[1], [x7] -; CHECK-NEXT: ld1 { v7.s }[1], [x5] -; CHECK-NEXT: usubl v2.8h, v2.8b, v6.8b -; CHECK-NEXT: usubl v4.8h, v0.8b, v4.8b -; CHECK-NEXT: usubl v1.8h, v1.8b, v5.8b -; CHECK-NEXT: usubl v3.8h, v3.8b, v7.8b -; CHECK-NEXT: shll v5.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v3.4h, #16 -; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 -; CHECK-NEXT: shll2 v6.4s, v1.8h, #16 -; CHECK-NEXT: saddw2 v1.4s, v3.4s, v2.8h -; CHECK-NEXT: saddw2 v3.4s, v6.4s, v4.8h -; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: saddw v2.4s, v5.4s, v4.4h +; CHECK-NEXT: ldr d0, [x4] +; 
CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: ldr d3, [x1] +; CHECK-NEXT: ldr d6, [x5] +; CHECK-NEXT: ldr d1, [x2] +; CHECK-NEXT: ldr d4, [x3] +; CHECK-NEXT: ldr d5, [x7] +; CHECK-NEXT: ldr d7, [x6] +; CHECK-NEXT: usubl v0.8h, v2.8b, v0.8b +; CHECK-NEXT: usubl v2.8h, v3.8b, v6.8b +; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v3.8h, v1.8b, v7.8b +; CHECK-NEXT: shll2 v1.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 +; CHECK-NEXT: saddw v0.4s, v1.4s, v0.4h +; CHECK-NEXT: saddw v1.4s, v5.4s, v2.4h +; CHECK-NEXT: shll2 v2.4s, v3.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v4.8h, #16 +; CHECK-NEXT: saddw v2.4s, v2.4s, v3.4h +; CHECK-NEXT: saddw v3.4s, v5.4s, v4.4h ; CHECK-NEXT: ret %j1 = load <4 x i8>, ptr %p %p1 = getelementptr i8, ptr %p, i32 4 @@ -1270,12 +1256,11 @@ define <8 x i32> @commuted_loads(ptr %p1, ptr %p2) { ; CHECK-LABEL: commuted_loads: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] -; CHECK-NEXT: ldp d3, d2, [x1] -; CHECK-NEXT: add v0.8b, v3.8b, v0.8b -; CHECK-NEXT: add v1.8b, v2.8b, v1.8b +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: add v0.16b, v1.16b, v0.16b +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: ushll2 v2.4s, v1.8h, #3 ; CHECK-NEXT: ushll v3.4s, v1.4h, #3 ; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v0.8h @@ -1353,3 +1338,74 @@ %a = sub <8 x i32> %se2, %e1 ret <8 x i32> %a } + +define <4 x i32> @bitcast(ptr %p) { +; CHECK-LABEL: bitcast: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 +; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h +; CHECK-NEXT: ret + %l1b = load float, ptr %p + %l1 = bitcast float %l1b to <4 x i8> + %q = getelementptr i8, ptr %p, i32 4 + %l2b = load float, ptr %q + %l2 = bitcast float %l2b to <4 x i8> + %e1 = zext <4 x i8> %l1 to <4 x i32> + %e2 = zext <4 x i8> %l2 to <4 x i32> + %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3> + %a = add <4 x i32> %e1, %e3 + ret <4 x
i32> %a +} + +define <4 x i32> @atomic(ptr %p) { +; CHECK-LABEL: atomic: +; CHECK: // %bb.0: +; CHECK-NEXT: ldar w8, [x0] +; CHECK-NEXT: ldr s0, [x0, #4] +; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: zip1 v1.8b, v1.8b, v0.8b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ret + %l1b = load atomic float, ptr %p acquire, align 4 + %l1 = bitcast float %l1b to <4 x i8> + %q = getelementptr i8, ptr %p, i32 4 + %l2b = load float, ptr %q + %l2 = bitcast float %l2b to <4 x i8> + %e1 = zext <4 x i8> %l1 to <4 x i32> + %e2 = zext <4 x i8> %l2 to <4 x i32> + %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3> + %a = add <4 x i32> %e1, %e3 + ret <4 x i32> %a +} + +define <4 x i32> @volatile(ptr %p) { +; CHECK-LABEL: volatile: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ldr s1, [x0, #4] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #3 +; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %l1b = load volatile float, ptr %p + %l1 = bitcast float %l1b to <4 x i8> + %q = getelementptr i8, ptr %p, i32 4 + %l2b = load float, ptr %q + %l2 = bitcast float %l2b to <4 x i8> + %e1 = zext <4 x i8> %l1 to <4 x i32> + %e2 = zext <4 x i8> %l2 to <4 x i32> + %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3> + %a = add <4 x i32> %e1, %e3 + ret <4 x i32> %a +} Index: llvm/test/CodeGen/AArch64/extbinopload2.ll =================================================================== --- llvm/test/CodeGen/AArch64/extbinopload2.ll +++ llvm/test/CodeGen/AArch64/extbinopload2.ll @@ -47,9 +47,9 @@ define <8 x i16> @std_v8i8_v8i16(ptr %p) { ; CHECK-LABEL: std_v8i8_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ushll2
v1.8h, v0.16b, #3 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #3 ; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %l1 = load <8 x i8>, ptr %p @@ -65,13 +65,13 @@ define <16 x i16> @std_v16i8_v16i16(ptr %p) { ; CHECK-LABEL: std_v16i8_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #3 -; CHECK-NEXT: ushll v4.8h, v1.8b, #3 -; CHECK-NEXT: sub v1.8h, v0.8h, v3.8h -; CHECK-NEXT: sub v0.8h, v2.8h, v4.8h +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ushll v4.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #3 +; CHECK-NEXT: ushll v0.8h, v0.8b, #3 +; CHECK-NEXT: sub v1.8h, v3.8h, v2.8h +; CHECK-NEXT: sub v0.8h, v4.8h, v0.8h ; CHECK-NEXT: ret %l1 = load <16 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 16 @@ -110,11 +110,10 @@ define <4 x i32> @std_v4i8_v4i32(ptr %p) { ; CHECK-LABEL: std_v4i8_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %l1 = load <4 x i8>, ptr %p @@ -130,15 +129,15 @@ define <8 x i32> @std_v8i8_v8i32(ptr %p) { ; CHECK-LABEL: std_v8i8_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 ; CHECK-NEXT: ushll v4.4s, v1.4h, #3 -; CHECK-NEXT: sub v1.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v2.4s, v4.4s +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; 
CHECK-NEXT: sub v1.4s, v3.4s, v2.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s ; CHECK-NEXT: ret %l1 = load <8 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 8 @@ -153,23 +152,23 @@ define <16 x i32> @std_v16i8_v16i32(ptr %p) { ; CHECK-LABEL: std_v16i8_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v6.4s, v1.4h, #3 -; CHECK-NEXT: ushll v7.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #3 -; CHECK-NEXT: sub v3.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v2.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v4.4s, v6.4s -; CHECK-NEXT: sub v2.4s, v5.4s, v7.4s +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 +; CHECK-NEXT: ushll v5.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v0.8h, #3 +; CHECK-NEXT: sub v3.4s, v3.4s, v1.4s +; CHECK-NEXT: ushll2 v1.4s, v5.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: ushll v5.4s, v5.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s +; CHECK-NEXT: sub v0.4s, v5.4s, v0.4s +; CHECK-NEXT: sub v2.4s, v4.4s, v2.4s ; CHECK-NEXT: ret %l1 = load <16 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 16 @@ -184,16 +183,16 @@ define <2 x i64> @std_v2i8_v2i64(ptr %p) { ; CHECK-LABEL: std_v2i8_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrb w9, [x0] -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: ldrb w8, [x0, #3] -; CHECK-NEXT: fmov d1, x9 -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: mov v0.d[1], x8 -; 
CHECK-NEXT: mov v1.d[1], x9 -; CHECK-NEXT: shl v0.2d, v0.2d, #3 -; CHECK-NEXT: sub v0.2d, v1.2d, v0.2d +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: movi v0.2d, #0x000000000000ff +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #0 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: and v2.16b, v2.16b, v0.16b +; CHECK-NEXT: shl v2.2d, v2.2d, #3 +; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d ; CHECK-NEXT: ret %l1 = load <2 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 2 @@ -208,24 +207,16 @@ define <4 x i64> @std_v4i8_v4i64(ptr %p) { ; CHECK-LABEL: std_v4i8_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s1, s2, [x0] -; CHECK-NEXT: movi v0.2d, #0x000000000000ff -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v3.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll2 v4.2d, v2.4s, #0 -; CHECK-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-NEXT: and v3.16b, v3.16b, v0.16b -; CHECK-NEXT: and v1.16b, v1.16b, v0.16b -; CHECK-NEXT: and v4.16b, v4.16b, v0.16b -; CHECK-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-NEXT: shl v2.2d, v4.2d, #3 -; CHECK-NEXT: shl v0.2d, v0.2d, #3 -; CHECK-NEXT: sub v1.2d, v1.2d, v2.2d -; CHECK-NEXT: sub v0.2d, v3.2d, v0.2d +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-NEXT: ushll v4.2d, v1.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v2.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v4.2d ; CHECK-NEXT: ret %l1 = load <4 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 4 @@ -240,25 +231,25 @@ define <8 x i64> @std_v8i8_v8i64(ptr %p) { ; CHECK-LABEL: std_v8i8_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] +; 
CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v6.2d, v1.2s, #3 -; CHECK-NEXT: ushll v7.2d, v3.2s, #3 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0 +; CHECK-NEXT: ushll v5.4s, v1.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v4.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v5.4s, #3 +; CHECK-NEXT: sub v3.2d, v3.2d, v1.2d +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v5.2d, v5.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v1.2d, v1.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v5.2d +; CHECK-NEXT: sub v2.2d, v4.2d, v2.2d ; CHECK-NEXT: ret %l1 = load <8 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 8 @@ -273,43 +264,43 @@ define <16 x i64> @std_v16i8_v16i64(ptr %p) { ; CHECK-LABEL: std_v16i8_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: ushll v3.4s, v2.4h, #0 -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll v16.2d, v3.2s, #0 -; CHECK-NEXT: ushll2 v19.2d, v3.4s, #0 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; 
CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v5.4s, v3.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v17.2d, v4.2s, #0 -; CHECK-NEXT: ushll v18.2d, v2.2s, #0 -; CHECK-NEXT: ushll v6.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v20.2d, v1.2s, #3 -; CHECK-NEXT: ushll v21.2d, v3.2s, #3 -; CHECK-NEXT: ushll v22.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: ushll v23.2d, v5.2s, #3 -; CHECK-NEXT: ushll2 v24.2d, v3.4s, #3 +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v5.4s, v2.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v16.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v17.4s, v1.8h, #0 ; CHECK-NEXT: ushll2 v3.2d, v5.4s, #3 -; CHECK-NEXT: ushll2 v5.2d, v7.4s, #3 -; CHECK-NEXT: sub v7.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v3.2d, v2.2d, v5.2d -; CHECK-NEXT: sub v5.2d, v4.2d, v24.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v23.2d -; CHECK-NEXT: sub v1.2d, v19.2d, v1.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v16.2d, v20.2d -; CHECK-NEXT: sub v4.2d, v17.2d, v21.2d +; CHECK-NEXT: ushll2 v7.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v18.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v17.4s, #0 +; CHECK-NEXT: sub v7.2d, v7.2d, v3.2d +; CHECK-NEXT: sub v3.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll v18.4s, v2.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: ushll2 v2.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v4.4s, #0 +; CHECK-NEXT: ushll v20.2d, v5.2s, #3 +; CHECK-NEXT: sub v5.2d, v19.2d, v2.2d +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v19.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v19.4s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll v17.2d, 
v17.2s, #0 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d +; CHECK-NEXT: sub v2.2d, v17.2d, v16.2d +; CHECK-NEXT: ushll v16.2d, v18.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: ushll v17.2d, v19.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v6.2d, v6.2d, v20.2d +; CHECK-NEXT: sub v0.2d, v17.2d, v0.2d +; CHECK-NEXT: sub v4.2d, v4.2d, v16.2d ; CHECK-NEXT: ret %l1 = load <16 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 16 @@ -348,9 +339,9 @@ define <4 x i32> @std_v4i16_v4i32(ptr %p) { ; CHECK-LABEL: std_v4i16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %l1 = load <4 x i16>, ptr %p @@ -366,13 +357,13 @@ define <8 x i32> @std_v8i16_v8i32(ptr %p) { ; CHECK-LABEL: std_v8i16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 -; CHECK-NEXT: ushll v4.4s, v1.4h, #3 -; CHECK-NEXT: sub v1.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v2.4s, v4.4s +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 +; CHECK-NEXT: ushll v4.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: sub v1.4s, v3.4s, v2.4s +; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s ; CHECK-NEXT: ret %l1 = load <8 x i16>, ptr %p %q = getelementptr i8, ptr %p, i32 16 @@ -387,20 +378,21 @@ define <16 x i32> @std_v16i16_v16i32(ptr %p) { ; CHECK-LABEL: std_v16i16_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ldp q3, q2, [x0, #32] -; CHECK-NEXT: ushll v4.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0 -; CHECK-NEXT: ushll v16.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 
v1.4s, v3.8h, #3 -; CHECK-NEXT: ushll v7.4s, v2.4h, #3 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: sub v1.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v3.4s, v6.4s, v2.4s -; CHECK-NEXT: sub v0.4s, v5.4s, v16.4s -; CHECK-NEXT: sub v2.4s, v4.4s, v7.4s +; CHECK-NEXT: ldp q2, q0, [x0, #16] +; CHECK-NEXT: ldr q3, [x0] +; CHECK-NEXT: ldr q4, [x0, #48] +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 +; CHECK-NEXT: ushll2 v5.4s, v3.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v4.8h, #3 +; CHECK-NEXT: sub v1.4s, v5.4s, v1.4s +; CHECK-NEXT: ushll2 v5.4s, v2.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: ushll v7.4s, v3.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #3 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: sub v3.4s, v5.4s, v6.4s +; CHECK-NEXT: sub v0.4s, v7.4s, v0.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s ; CHECK-NEXT: ret %l1 = load <16 x i16>, ptr %p %q = getelementptr i8, ptr %p, i32 32 @@ -415,16 +407,11 @@ define <2 x i64> @std_v2i16_v2i64(ptr %p) { ; CHECK-LABEL: std_v2i16_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0, #4] -; CHECK-NEXT: ldrh w9, [x0] -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: ldrh w8, [x0, #6] -; CHECK-NEXT: fmov d1, x9 -; CHECK-NEXT: ldrh w9, [x0, #2] -; CHECK-NEXT: mov v0.d[1], x8 -; CHECK-NEXT: mov v1.d[1], x9 -; CHECK-NEXT: shl v0.2d, v0.2d, #3 -; CHECK-NEXT: sub v0.2d, v1.2d, v0.2d +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %l1 = load <2 x i16>, ptr %p %q = getelementptr i8, ptr %p, i32 4 @@ -439,15 +426,15 @@ define <4 x i64> @std_v4i16_v4i64(ptr %p) { ; CHECK-LABEL: std_v4i16_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, 
v1.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 ; CHECK-NEXT: ushll v4.2d, v1.2s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v2.2d, v4.2d +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v2.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v4.2d ; CHECK-NEXT: ret %l1 = load <4 x i16>, ptr %p %q = getelementptr i8, ptr %p, i32 8 @@ -462,23 +449,23 @@ define <8 x i64> @std_v8i16_v8i64(ptr %p) { ; CHECK-LABEL: std_v8i16_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v6.2d, v1.2s, #3 -; CHECK-NEXT: ushll v7.2d, v3.2s, #3 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-NEXT: ushll v5.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v4.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v0.4s, #3 +; CHECK-NEXT: sub v3.2d, v3.2d, v1.2d +; CHECK-NEXT: ushll2 v1.2d, v5.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v1.2d, v1.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v5.2d, v0.2d +; CHECK-NEXT: sub v2.2d, v4.2d, v2.2d ; CHECK-NEXT: ret %l1 = load <8 x i16>, ptr %p %q = getelementptr i8, ptr %p, i32 16 @@ -493,40 +480,41 @@ define <16 x i64> @std_v16i16_v16i64(ptr %p) { ; CHECK-LABEL: 
std_v16i16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ldp q3, q2, [x0, #32] -; CHECK-NEXT: ushll v4.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v16.2d, v4.2s, #0 -; CHECK-NEXT: ushll2 v7.4s, v3.8h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll2 v19.4s, v2.8h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v17.2d, v5.2s, #0 -; CHECK-NEXT: ushll v6.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll v18.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v20.2d, v2.2s, #3 -; CHECK-NEXT: ushll v21.2d, v3.2s, #3 -; CHECK-NEXT: ushll v22.2d, v19.2s, #3 -; CHECK-NEXT: ushll2 v23.2d, v2.4s, #3 -; CHECK-NEXT: ushll v2.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v24.2d, v3.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v7.2d, v19.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v7.2d, v1.2d, v7.2d -; CHECK-NEXT: sub v1.2d, v5.2d, v24.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v2.2d -; CHECK-NEXT: sub v5.2d, v4.2d, v23.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v17.2d, v21.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v20.2d +; CHECK-NEXT: ldp q4, q0, [x0, #16] +; CHECK-NEXT: ushll2 v17.4s, v4.8h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q5, [x0, #48] +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v16.4s, v5.8h, #0 +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v18.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v17.4s, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: sub v3.2d, v7.2d, v3.2d +; CHECK-NEXT: sub v7.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll v18.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 
v1.2d, v0.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v18.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: sub v1.2d, v19.2d, v1.2d +; CHECK-NEXT: ushll v19.4s, v5.4h, #0 +; CHECK-NEXT: sub v2.2d, v6.2d, v2.2d +; CHECK-NEXT: ushll2 v5.2d, v19.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: sub v5.2d, v6.2d, v5.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: ushll v16.2d, v18.2s, #0 +; CHECK-NEXT: ushll v17.2d, v19.2s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v0.2d, v16.2d, v0.2d +; CHECK-NEXT: sub v4.2d, v4.2d, v17.2d ; CHECK-NEXT: ret %l1 = load <16 x i16>, ptr %p %q = getelementptr i8, ptr %p, i32 32 @@ -541,9 +529,9 @@ define <2 x i64> @std_v2i32_v2i64(ptr %p) { ; CHECK-LABEL: std_v2i32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #3 ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %l1 = load <2 x i32>, ptr %p @@ -559,13 +547,13 @@ define <4 x i64> @std_v4i32_v4i64(ptr %p) { ; CHECK-LABEL: std_v4i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 -; CHECK-NEXT: ushll v4.2d, v1.2s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v2.2d, v4.2d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ushll2 v3.2d, v1.4s, #0 +; CHECK-NEXT: ushll v4.2d, v1.2s, #0 +; CHECK-NEXT: ushll2 v2.2d, v0.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v2.2d +; CHECK-NEXT: sub v0.2d, v4.2d, v0.2d ; CHECK-NEXT: ret %l1 = load <4 x i32>, ptr %p %q = getelementptr i8, ptr %p, i32 16 @@ -580,20 +568,21 @@ define <8 x i64> @std_v8i32_v8i64(ptr %p) { ; CHECK-LABEL: 
std_v8i32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ldp q3, q2, [x0, #32] -; CHECK-NEXT: ushll v4.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v6.2d, v1.4s, #0 -; CHECK-NEXT: ushll v16.2d, v3.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v3.4s, #3 -; CHECK-NEXT: ushll v7.2d, v2.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v1.2d -; CHECK-NEXT: sub v3.2d, v6.2d, v2.2d -; CHECK-NEXT: sub v0.2d, v5.2d, v16.2d -; CHECK-NEXT: sub v2.2d, v4.2d, v7.2d +; CHECK-NEXT: ldp q2, q0, [x0, #16] +; CHECK-NEXT: ldr q3, [x0] +; CHECK-NEXT: ldr q4, [x0, #48] +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 +; CHECK-NEXT: ushll2 v5.2d, v3.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #3 +; CHECK-NEXT: sub v1.2d, v5.2d, v1.2d +; CHECK-NEXT: ushll2 v5.2d, v2.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: ushll v7.2d, v3.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: sub v3.2d, v5.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v7.2d, v0.2d +; CHECK-NEXT: sub v2.2d, v2.2d, v4.2d ; CHECK-NEXT: ret %l1 = load <8 x i32>, ptr %p %q = getelementptr i8, ptr %p, i32 32 @@ -608,34 +597,34 @@ define <16 x i64> @std_v16i32_v16i64(ptr %p) { ; CHECK-LABEL: std_v16i32_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: ushll v16.2d, v2.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ushll v6.2d, v3.2s, #0 -; CHECK-NEXT: ushll2 v7.2d, v3.4s, #0 -; CHECK-NEXT: ushll v18.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ldp q5, q4, [x0, #96] -; CHECK-NEXT: ushll v17.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v20.2d, v1.4s, #0 -; CHECK-NEXT: ushll v22.2d, v5.2s, #3 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #3 -; CHECK-NEXT: ldp q19, q3, [x0, #64] -; CHECK-NEXT: ushll v21.2d, v4.2s, #3 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #3 -; CHECK-NEXT: sub v5.2d, v2.2d, v5.2d -; 
CHECK-NEXT: ushll v24.2d, v19.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v19.4s, #3 -; CHECK-NEXT: ushll v23.2d, v3.2s, #3 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v1.2d -; CHECK-NEXT: sub v3.2d, v20.2d, v3.2d -; CHECK-NEXT: sub v7.2d, v7.2d, v4.2d -; CHECK-NEXT: sub v0.2d, v18.2d, v24.2d -; CHECK-NEXT: sub v2.2d, v17.2d, v23.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v22.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v21.2d +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: ldp q6, q4, [x0, #64] +; CHECK-NEXT: ushll2 v7.2d, v0.4s, #0 +; CHECK-NEXT: ushll2 v1.2d, v6.4s, #3 +; CHECK-NEXT: ushll v6.2d, v6.2s, #3 +; CHECK-NEXT: ldp q17, q16, [x0, #96] +; CHECK-NEXT: ushll2 v5.2d, v4.4s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll2 v20.2d, v17.4s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ldp q19, q18, [x0, #32] +; CHECK-NEXT: ushll2 v22.2d, v16.4s, #3 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll2 v21.2d, v19.4s, #0 +; CHECK-NEXT: sub v5.2d, v21.2d, v20.2d +; CHECK-NEXT: ushll v20.2d, v0.2s, #0 +; CHECK-NEXT: ushll2 v7.2d, v18.4s, #0 +; CHECK-NEXT: sub v0.2d, v2.2d, v6.2d +; CHECK-NEXT: sub v2.2d, v20.2d, v4.2d +; CHECK-NEXT: ushll v4.2d, v17.2s, #3 +; CHECK-NEXT: ushll v6.2d, v19.2s, #0 +; CHECK-NEXT: ushll v17.2d, v18.2s, #0 +; CHECK-NEXT: sub v7.2d, v7.2d, v22.2d +; CHECK-NEXT: sub v4.2d, v6.2d, v4.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d ; CHECK-NEXT: ret %l1 = load <16 x i32>, ptr %p %q = getelementptr i8, ptr %p, i32 64 @@ -721,12 +710,11 @@ define <8 x i16> @dbl_v8i8_v8i16(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v8i8_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] -; CHECK-NEXT: ldp d2, d3, [x1] -; CHECK-NEXT: add v0.8b, v0.8b, v2.8b -; CHECK-NEXT: add v1.8b, v1.8b, v3.8b +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: 
ushll2 v1.8h, v0.16b, #3 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #3 ; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %l11 = load <8 x i8>, ptr %p1 @@ -747,16 +735,16 @@ define <16 x i16> @dbl_v16i8_v16i16(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v16i8_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: add v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: add v1.16b, v1.16b, v3.16b -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #3 -; CHECK-NEXT: ushll v4.8h, v1.8b, #3 -; CHECK-NEXT: sub v1.8h, v0.8h, v3.8h -; CHECK-NEXT: sub v0.8h, v2.8h, v4.8h +; CHECK-NEXT: add v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #3 +; CHECK-NEXT: ushll v0.8h, v0.8b, #3 +; CHECK-NEXT: ushll v4.8h, v1.8b, #0 +; CHECK-NEXT: sub v1.8h, v3.8h, v2.8h +; CHECK-NEXT: sub v0.8h, v4.8h, v0.8h ; CHECK-NEXT: ret %l11 = load <16 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -816,21 +804,13 @@ define <4 x i32> @dbl_v4i8_v4i32(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v4i8_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s1, s2, [x0] -; CHECK-NEXT: ldp s3, s4, [x1] -; CHECK-NEXT: movi v0.2d, #0x0000ff000000ff -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: add v2.4h, v2.4h, v4.4h -; CHECK-NEXT: add v1.4h, v1.4h, v3.4h -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: and v2.16b, v2.16b, v0.16b -; CHECK-NEXT: shl v2.4s, v2.4s, #3 -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 +; 
CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %l11 = load <4 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -850,18 +830,17 @@ define <8 x i32> @dbl_v8i8_v8i32(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v8i8_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] -; CHECK-NEXT: ldp d2, d3, [x1] -; CHECK-NEXT: add v0.8b, v0.8b, v2.8b -; CHECK-NEXT: add v1.8b, v1.8b, v3.8b +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 ; CHECK-NEXT: ushll v4.4s, v1.4h, #3 -; CHECK-NEXT: sub v1.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v2.4s, v4.4s +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: sub v1.4s, v3.4s, v2.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s ; CHECK-NEXT: ret %l11 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -881,26 +860,26 @@ define <16 x i32> @dbl_v16i8_v16i32(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v16i8_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: add v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: add v1.16b, v1.16b, v3.16b -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v6.4s, v1.4h, #3 -; CHECK-NEXT: ushll v7.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #3 -; CHECK-NEXT: sub v3.4s, v0.4s, v3.4s -; 
CHECK-NEXT: sub v1.4s, v2.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v4.4s, v6.4s -; CHECK-NEXT: sub v2.4s, v5.4s, v7.4s +; CHECK-NEXT: add v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v5.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v0.8h, #3 +; CHECK-NEXT: sub v3.4s, v3.4s, v1.4s +; CHECK-NEXT: ushll2 v1.4s, v5.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: ushll v5.4s, v5.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s +; CHECK-NEXT: sub v0.4s, v5.4s, v0.4s +; CHECK-NEXT: sub v2.4s, v4.4s, v2.4s ; CHECK-NEXT: ret %l11 = load <16 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -920,29 +899,19 @@ define <2 x i64> @dbl_v2i8_v2i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v2i8_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: add x9, x1, #2 -; CHECK-NEXT: ld1 { v2.b }[0], [x0] -; CHECK-NEXT: add x10, x0, #1 -; CHECK-NEXT: ld1 { v3.b }[0], [x1] -; CHECK-NEXT: ld1 { v0.b }[0], [x8] -; CHECK-NEXT: add x8, x0, #3 -; CHECK-NEXT: ld1 { v1.b }[0], [x9] -; CHECK-NEXT: add x9, x1, #3 -; CHECK-NEXT: ld1 { v2.b }[4], [x10] -; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-NEXT: ld1 { v3.b }[4], [x8] -; CHECK-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0x000000000000ff -; CHECK-NEXT: add v2.2s, v2.2s, v3.2s -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: shl v0.2d, v0.2d, #3 -; CHECK-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-NEXT: sub v0.2d, v1.2d, v0.2d +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: ldr s2, [x1] +; CHECK-NEXT: movi v0.2d, #0x000000000000ff +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v2.8h, v2.8b, 
#0 +; CHECK-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #0 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: and v2.16b, v2.16b, v0.16b +; CHECK-NEXT: shl v2.2d, v2.2d, #3 +; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d ; CHECK-NEXT: ret %l11 = load <2 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 2 @@ -962,29 +931,18 @@ define <4 x i64> @dbl_v4i8_v4i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v4i8_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s1, s2, [x0] -; CHECK-NEXT: ldp s3, s4, [x1] -; CHECK-NEXT: movi v0.2d, #0x000000000000ff -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-NEXT: add v1.4h, v1.4h, v3.4h -; CHECK-NEXT: add v2.4h, v2.4h, v4.4h -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v3.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll2 v4.2d, v2.4s, #0 -; CHECK-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-NEXT: and v3.16b, v3.16b, v0.16b -; CHECK-NEXT: and v1.16b, v1.16b, v0.16b -; CHECK-NEXT: and v4.16b, v4.16b, v0.16b -; CHECK-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-NEXT: shl v4.2d, v4.2d, #3 -; CHECK-NEXT: shl v0.2d, v0.2d, #3 -; CHECK-NEXT: sub v1.2d, v1.2d, v4.2d -; CHECK-NEXT: sub v0.2d, v3.2d, v0.2d +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-NEXT: ushll v4.2d, v1.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v2.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v4.2d ; CHECK-NEXT: ret %l11 = load <4 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -1004,28 +962,27 @@ define <8 x i64> 
@dbl_v8i8_v8i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v8i8_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] -; CHECK-NEXT: ldp d2, d3, [x1] -; CHECK-NEXT: add v0.8b, v0.8b, v2.8b -; CHECK-NEXT: add v1.8b, v1.8b, v3.8b +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v6.2d, v1.2s, #3 -; CHECK-NEXT: ushll v7.2d, v3.2s, #3 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v5.2d, v4.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v1.4s, #3 +; CHECK-NEXT: sub v3.2d, v5.2d, v3.2d +; CHECK-NEXT: ushll2 v5.2d, v0.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v7.2d, v1.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v1.2d, v5.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v7.2d +; CHECK-NEXT: sub v2.2d, v4.2d, v2.2d ; CHECK-NEXT: ret %l11 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -1045,46 +1002,46 @@ define <16 x i64> @dbl_v16i8_v16i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v16i8_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: add v0.16b, v0.16b, v2.16b -; CHECK-NEXT: ushll v2.8h, v0.8b, 
#0 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: add v1.16b, v1.16b, v3.16b -; CHECK-NEXT: ushll v3.4s, v2.4h, #0 -; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: ushll v16.2d, v3.2s, #0 -; CHECK-NEXT: ushll2 v19.2d, v3.4s, #0 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: add v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v5.4s, v3.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v17.2d, v4.2s, #0 -; CHECK-NEXT: ushll v18.2d, v2.2s, #0 -; CHECK-NEXT: ushll v6.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v20.2d, v1.2s, #3 -; CHECK-NEXT: ushll v21.2d, v3.2s, #3 -; CHECK-NEXT: ushll v22.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: ushll v23.2d, v5.2s, #3 -; CHECK-NEXT: ushll2 v24.2d, v3.4s, #3 +; CHECK-NEXT: ushll2 v5.4s, v2.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v16.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v17.4s, v1.8h, #0 ; CHECK-NEXT: ushll2 v3.2d, v5.4s, #3 -; CHECK-NEXT: ushll2 v5.2d, v7.4s, #3 -; CHECK-NEXT: sub v7.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v3.2d, v2.2d, v5.2d -; CHECK-NEXT: sub v5.2d, v4.2d, v24.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v23.2d -; CHECK-NEXT: sub v1.2d, v19.2d, v1.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v16.2d, v20.2d -; CHECK-NEXT: sub v4.2d, v17.2d, v21.2d +; CHECK-NEXT: ushll2 v7.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v18.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v17.4s, #0 +; CHECK-NEXT: sub v7.2d, v7.2d, v3.2d +; CHECK-NEXT: sub v3.2d, v19.2d, v18.2d +; 
CHECK-NEXT: ushll v18.4s, v2.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: ushll2 v2.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v4.4s, #0 +; CHECK-NEXT: ushll v20.2d, v5.2s, #3 +; CHECK-NEXT: sub v5.2d, v19.2d, v2.2d +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v19.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v19.4s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d +; CHECK-NEXT: sub v2.2d, v17.2d, v16.2d +; CHECK-NEXT: ushll v16.2d, v18.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: ushll v17.2d, v19.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v6.2d, v6.2d, v20.2d +; CHECK-NEXT: sub v0.2d, v17.2d, v0.2d +; CHECK-NEXT: sub v4.2d, v4.2d, v16.2d ; CHECK-NEXT: ret %l11 = load <16 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -1144,12 +1101,11 @@ define <4 x i32> @dbl_v4i16_v4i32(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v4i16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] -; CHECK-NEXT: ldp d2, d3, [x1] -; CHECK-NEXT: add v0.4h, v0.4h, v2.4h -; CHECK-NEXT: add v1.4h, v1.4h, v3.4h +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %l11 = load <4 x i16>, ptr %p1 @@ -1170,16 +1126,16 @@ define <8 x i32> @dbl_v8i16_v8i32(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v8i16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: add v1.8h, v1.8h, v3.8h -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 -; CHECK-NEXT: ushll v4.4s, 
v1.4h, #3 -; CHECK-NEXT: sub v1.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v2.4s, v4.4s +; CHECK-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: ushll v4.4s, v1.4h, #0 +; CHECK-NEXT: sub v1.4s, v3.4s, v2.4s +; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s ; CHECK-NEXT: ret %l11 = load <8 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -1199,26 +1155,27 @@ define <16 x i32> @dbl_v16i16_v16i32(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v16i16_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: ldp q5, q4, [x1] -; CHECK-NEXT: add v1.8h, v1.8h, v5.8h -; CHECK-NEXT: ldp q6, q7, [x1, #32] -; CHECK-NEXT: add v0.8h, v0.8h, v4.8h -; CHECK-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: add v2.8h, v2.8h, v6.8h -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: add v3.8h, v3.8h, v7.8h -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v6.4s, v3.4h, #3 -; CHECK-NEXT: ushll v7.4s, v2.4h, #3 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3 -; CHECK-NEXT: sub v1.4s, v1.4s, v2.4s -; CHECK-NEXT: sub v3.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v5.4s, v7.4s -; CHECK-NEXT: sub v2.4s, v4.4s, v6.4s +; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q4, q5, [x1, #16] +; CHECK-NEXT: add v4.8h, v3.8h, v4.8h +; CHECK-NEXT: ldr q6, [x1, #48] +; CHECK-NEXT: ldr q7, [x1] +; CHECK-NEXT: add v0.8h, v0.8h, v5.8h +; CHECK-NEXT: add v6.8h, v1.8h, v6.8h +; CHECK-NEXT: add v2.8h, v2.8h, v7.8h +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v2.8h, #0 +; CHECK-NEXT: ushll2 v5.4s, v6.8h, #3 +; CHECK-NEXT: ushll2 v7.4s, v4.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v5.4s +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ushll 
v5.4s, v6.4h, #3 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s +; CHECK-NEXT: sub v2.4s, v4.4s, v5.4s ; CHECK-NEXT: ret %l11 = load <16 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 32 @@ -1238,29 +1195,13 @@ define <2 x i64> @dbl_v2i16_v2i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v2i16_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #4 -; CHECK-NEXT: add x9, x1, #4 -; CHECK-NEXT: ld1 { v2.h }[0], [x0] -; CHECK-NEXT: add x10, x0, #2 -; CHECK-NEXT: ld1 { v3.h }[0], [x1] -; CHECK-NEXT: ld1 { v0.h }[0], [x8] -; CHECK-NEXT: add x8, x0, #6 -; CHECK-NEXT: ld1 { v1.h }[0], [x9] -; CHECK-NEXT: add x9, x1, #6 -; CHECK-NEXT: ld1 { v2.h }[2], [x10] -; CHECK-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-NEXT: add x8, x1, #2 -; CHECK-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-NEXT: ld1 { v3.h }[2], [x8] -; CHECK-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0x0000000000ffff -; CHECK-NEXT: add v2.2s, v2.2s, v3.2s +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: shl v0.2d, v0.2d, #3 -; CHECK-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-NEXT: sub v0.2d, v1.2d, v0.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %l11 = load <2 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -1280,18 +1221,17 @@ define <4 x i64> @dbl_v4i16_v4i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v4i16_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] -; CHECK-NEXT: ldp d2, d3, [x1] -; CHECK-NEXT: add v0.4h, v0.4h, v2.4h -; CHECK-NEXT: add v1.4h, v1.4h, v3.4h +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.2d, 
v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 ; CHECK-NEXT: ushll v4.2d, v1.2s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v2.2d, v4.2d +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v2.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v4.2d ; CHECK-NEXT: ret %l11 = load <4 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -1311,26 +1251,26 @@ define <8 x i64> @dbl_v8i16_v8i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v8i16_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: add v1.8h, v1.8h, v3.8h -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v6.2d, v1.2s, #3 -; CHECK-NEXT: ushll v7.2d, v3.2s, #3 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v5.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v4.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v0.4s, #3 +; CHECK-NEXT: sub v3.2d, v3.2d, v1.2d +; CHECK-NEXT: ushll2 v1.2d, v5.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub 
v1.2d, v1.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v5.2d, v0.2d +; CHECK-NEXT: sub v2.2d, v4.2d, v2.2d ; CHECK-NEXT: ret %l11 = load <8 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -1350,46 +1290,47 @@ define <16 x i64> @dbl_v16i16_v16i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v16i16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: ldp q5, q4, [x1] -; CHECK-NEXT: add v1.8h, v1.8h, v5.8h -; CHECK-NEXT: ldp q6, q7, [x1, #32] -; CHECK-NEXT: add v0.8h, v0.8h, v4.8h -; CHECK-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: add v2.8h, v2.8h, v6.8h -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: add v3.8h, v3.8h, v7.8h -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v19.4s, v3.8h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v16.2d, v4.2s, #0 -; CHECK-NEXT: ushll v17.2d, v5.2s, #0 -; CHECK-NEXT: ushll v6.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll v18.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v20.2d, v3.2s, #3 -; CHECK-NEXT: ushll v21.2d, v2.2s, #3 -; CHECK-NEXT: ushll v22.2d, v19.2s, #3 -; CHECK-NEXT: ushll2 v23.2d, v3.4s, #3 -; CHECK-NEXT: ushll v24.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v7.2d, v19.4s, #3 -; CHECK-NEXT: sub v3.2d, v1.2d, v3.2d -; CHECK-NEXT: sub v7.2d, v0.2d, v7.2d -; CHECK-NEXT: sub v1.2d, v5.2d, v2.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v24.2d -; CHECK-NEXT: sub v5.2d, v4.2d, v23.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v17.2d, v21.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v20.2d +; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q4, q5, [x1, #16] +; CHECK-NEXT: add 
v4.8h, v3.8h, v4.8h +; CHECK-NEXT: ldr q6, [x1, #48] +; CHECK-NEXT: ldr q7, [x1] +; CHECK-NEXT: add v0.8h, v0.8h, v5.8h +; CHECK-NEXT: add v6.8h, v1.8h, v6.8h +; CHECK-NEXT: add v1.8h, v2.8h, v7.8h +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v5.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v16.4s, v6.8h, #0 +; CHECK-NEXT: ushll2 v17.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v5.4s, #0 +; CHECK-NEXT: ushll2 v18.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v17.4s, #0 +; CHECK-NEXT: sub v3.2d, v7.2d, v3.2d +; CHECK-NEXT: sub v7.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v18.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v18.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: sub v1.2d, v19.2d, v1.2d +; CHECK-NEXT: ushll v19.4s, v6.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: sub v2.2d, v5.2d, v2.2d +; CHECK-NEXT: ushll2 v5.2d, v19.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: sub v5.2d, v6.2d, v5.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: ushll v16.2d, v18.2s, #0 +; CHECK-NEXT: ushll v17.2d, v19.2s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v0.2d, v16.2d, v0.2d +; CHECK-NEXT: sub v4.2d, v4.2d, v17.2d ; CHECK-NEXT: ret %l11 = load <16 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 32 @@ -1409,12 +1350,11 @@ define <2 x i64> @dbl_v2i32_v2i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v2i32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] -; CHECK-NEXT: ldp d2, d3, [x1] -; CHECK-NEXT: add v0.2s, v0.2s, v2.2s -; CHECK-NEXT: add v1.2s, v1.2s, v3.2s +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; 
CHECK-NEXT: ushll v1.2d, v1.2s, #3 ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %l11 = load <2 x i32>, ptr %p1 @@ -1435,16 +1375,16 @@ define <4 x i64> @dbl_v4i32_v4i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v4i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 -; CHECK-NEXT: ushll v4.2d, v1.2s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v2.2d, v4.2d +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ushll2 v3.2d, v1.4s, #0 +; CHECK-NEXT: ushll2 v2.2d, v0.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: ushll v4.2d, v1.2s, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v2.2d +; CHECK-NEXT: sub v0.2d, v4.2d, v0.2d ; CHECK-NEXT: ret %l11 = load <4 x i32>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -1464,26 +1404,27 @@ define <8 x i64> @dbl_v8i32_v8i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v8i32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: ldp q5, q4, [x1] -; CHECK-NEXT: add v1.4s, v1.4s, v5.4s -; CHECK-NEXT: ldp q6, q7, [x1, #32] -; CHECK-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NEXT: ushll v5.2d, v1.2s, #0 -; CHECK-NEXT: ushll v4.2d, v0.2s, #0 -; CHECK-NEXT: add v2.4s, v2.4s, v6.4s -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: add v3.4s, v3.4s, v7.4s -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v6.2d, v3.2s, #3 -; CHECK-NEXT: ushll v7.2d, v2.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 -; CHECK-NEXT: sub v1.2d, v1.2d, v2.2d -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v5.2d, v7.2d -; CHECK-NEXT: sub v2.2d, v4.2d, v6.2d +; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q2, q3, 
[x0] +; CHECK-NEXT: ldp q4, q5, [x1, #16] +; CHECK-NEXT: add v4.4s, v3.4s, v4.4s +; CHECK-NEXT: ldr q6, [x1, #48] +; CHECK-NEXT: ldr q7, [x1] +; CHECK-NEXT: add v0.4s, v0.4s, v5.4s +; CHECK-NEXT: add v6.4s, v1.4s, v6.4s +; CHECK-NEXT: add v2.4s, v2.4s, v7.4s +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #0 +; CHECK-NEXT: ushll2 v5.2d, v6.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v4.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: ushll v5.2d, v6.2s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v0.2d, v2.2d, v0.2d +; CHECK-NEXT: sub v2.2d, v4.2d, v5.2d ; CHECK-NEXT: ret %l11 = load <8 x i32>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 32 @@ -1503,46 +1444,46 @@ define <16 x i64> @dbl_v16i32_v16i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dbl_v16i32_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0, #32] -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: ldp q17, q16, [x1] -; CHECK-NEXT: add v3.4s, v3.4s, v17.4s -; CHECK-NEXT: ldp q19, q18, [x1, #32] -; CHECK-NEXT: add v2.4s, v2.4s, v16.4s -; CHECK-NEXT: add v1.4s, v1.4s, v19.4s -; CHECK-NEXT: ldp q4, q5, [x0, #64] -; CHECK-NEXT: add v0.4s, v0.4s, v18.4s -; CHECK-NEXT: ushll2 v20.2d, v1.4s, #0 -; CHECK-NEXT: ldp q6, q7, [x0, #96] -; CHECK-NEXT: ldp q17, q16, [x1, #96] -; CHECK-NEXT: add v6.4s, v6.4s, v17.4s -; CHECK-NEXT: ushll v17.2d, v1.2s, #0 -; CHECK-NEXT: ldp q19, q18, [x1, #64] -; CHECK-NEXT: add v7.4s, v7.4s, v16.4s -; CHECK-NEXT: ushll v16.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: add v4.4s, v4.4s, v19.4s -; CHECK-NEXT: ushll v19.2d, v3.2s, #0 -; CHECK-NEXT: add v5.4s, v5.4s, v18.4s -; CHECK-NEXT: ushll v18.2d, v2.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v3.4s, #0 -; CHECK-NEXT: ushll v21.2d, v7.2s, #3 -; CHECK-NEXT: ushll v22.2d, v6.2s, #3 -; CHECK-NEXT: ushll v23.2d, v5.2s, #3 -; 
CHECK-NEXT: ushll v24.2d, v4.2s, #3 -; CHECK-NEXT: ushll2 v7.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v6.2d, v6.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v4.4s, #3 -; CHECK-NEXT: ushll2 v4.2d, v5.4s, #3 -; CHECK-NEXT: sub v1.2d, v1.2d, v3.2d -; CHECK-NEXT: sub v3.2d, v2.2d, v4.2d -; CHECK-NEXT: sub v5.2d, v20.2d, v6.2d -; CHECK-NEXT: sub v7.2d, v0.2d, v7.2d -; CHECK-NEXT: sub v0.2d, v19.2d, v24.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v23.2d -; CHECK-NEXT: sub v4.2d, v17.2d, v22.2d -; CHECK-NEXT: sub v6.2d, v16.2d, v21.2d +; CHECK-NEXT: ldp q4, q5, [x0, #96] +; CHECK-NEXT: ldp q6, q7, [x0, #32] +; CHECK-NEXT: ldp q16, q17, [x1, #32] +; CHECK-NEXT: add v6.4s, v6.4s, v16.4s +; CHECK-NEXT: ldp q18, q19, [x1, #96] +; CHECK-NEXT: add v17.4s, v7.4s, v17.4s +; CHECK-NEXT: ushll2 v21.2d, v17.4s, #0 +; CHECK-NEXT: add v4.4s, v4.4s, v18.4s +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: ldp q0, q1, [x0, #64] +; CHECK-NEXT: add v19.4s, v5.4s, v19.4s +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q20, q7, [x1] +; CHECK-NEXT: add v2.4s, v2.4s, v20.4s +; CHECK-NEXT: ushll2 v20.2d, v19.4s, #3 +; CHECK-NEXT: ldp q5, q16, [x1, #64] +; CHECK-NEXT: add v18.4s, v3.4s, v7.4s +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #0 +; CHECK-NEXT: ushll2 v7.2d, v18.4s, #0 +; CHECK-NEXT: add v0.4s, v0.4s, v5.4s +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: add v16.4s, v1.4s, v16.4s +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 +; CHECK-NEXT: ushll2 v5.2d, v16.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll v18.2d, v18.2s, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll2 v5.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v6.4s, #0 +; CHECK-NEXT: sub v0.2d, v2.2d, v0.2d +; CHECK-NEXT: sub v2.2d, v18.2d, v16.2d +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: ushll v16.2d, v19.2s, #3 +; CHECK-NEXT: sub v5.2d, v7.2d, v5.2d +; CHECK-NEXT: sub v7.2d, v21.2d, v20.2d +; 
CHECK-NEXT: sub v4.2d, v6.2d, v4.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d ; CHECK-NEXT: ret %l11 = load <16 x i32>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 64 @@ -1605,16 +1546,11 @@ define <4 x i32> @dblext_v4i8_v4i16_v4i32(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v4i8_v4i16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: ldp s2, s3, [x1] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: add v0.4h, v0.4h, v2.4h -; CHECK-NEXT: add v1.4h, v1.4h, v3.4h +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %l11 = load <4 x i8>, ptr %p1 @@ -1639,16 +1575,16 @@ define <8 x i32> @dblext_v8i8_v8i16_v8i32(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v8i8_v8i16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d3, [x1] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b -; CHECK-NEXT: uaddl v1.8h, v2.8b, v3.8b -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 -; CHECK-NEXT: ushll v4.4s, v1.4h, #3 -; CHECK-NEXT: sub v1.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v2.4s, v4.4s +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: uaddl2 v2.8h, v0.16b, v1.16b +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %l11 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -1672,24 +1608,24 @@ define <16 x i32> @dblext_v16i8_v16i16_v16i32(ptr %p1, ptr %p2) { ; CHECK-LABEL: 
dblext_v16i8_v16i16_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: uaddl2 v4.8h, v1.16b, v0.16b -; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b -; CHECK-NEXT: uaddl v1.8h, v2.8b, v3.8b -; CHECK-NEXT: uaddl2 v2.8h, v2.16b, v3.16b -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll v6.4s, v4.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v4.8h, #0 -; CHECK-NEXT: ushll v4.4s, v1.4h, #3 -; CHECK-NEXT: ushll v7.4s, v2.4h, #3 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #3 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: uaddl v4.8h, v0.8b, v2.8b +; CHECK-NEXT: uaddl2 v0.8h, v0.16b, v2.16b +; CHECK-NEXT: uaddl2 v5.8h, v1.16b, v3.16b +; CHECK-NEXT: uaddl v1.8h, v1.8b, v3.8b +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v2.4s, v5.8h, #3 +; CHECK-NEXT: ushll2 v6.4s, v1.8h, #3 ; CHECK-NEXT: sub v3.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v1.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v5.4s, v4.4s -; CHECK-NEXT: sub v2.4s, v6.4s, v7.4s +; CHECK-NEXT: ushll2 v2.4s, v4.8h, #0 +; CHECK-NEXT: ushll v5.4s, v5.4h, #3 +; CHECK-NEXT: ushll v7.4s, v1.4h, #3 +; CHECK-NEXT: sub v1.4s, v2.4s, v6.4s +; CHECK-NEXT: ushll v2.4s, v4.4h, #0 +; CHECK-NEXT: ushll v4.4s, v0.4h, #0 +; CHECK-NEXT: sub v0.4s, v2.4s, v7.4s +; CHECK-NEXT: sub v2.4s, v4.4s, v5.4s ; CHECK-NEXT: ret %l11 = load <16 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -1713,26 +1649,14 @@ define <2 x i64> @dblext_v2i8_v2i16_v2i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v2i8_v2i16_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #2] -; CHECK-NEXT: ldrb w10, [x1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrb w8, [x1, #2] -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: ldrb w10, [x0, #3] -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: ldrb w8, [x1, #1] -; CHECK-NEXT: mov v0.s[1], 
w9 -; CHECK-NEXT: ldrb w9, [x1, #3] -; CHECK-NEXT: mov v1.s[1], w10 -; CHECK-NEXT: mov v2.s[1], w8 -; CHECK-NEXT: mov v3.s[1], w9 -; CHECK-NEXT: add v0.2s, v0.2s, v2.2s -; CHECK-NEXT: add v1.2s, v1.2s, v3.2s +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ldr s1, [x1] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #3 ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %l11 = load <2 x i8>, ptr %p1 @@ -1757,22 +1681,17 @@ define <4 x i64> @dblext_v4i8_v4i16_v4i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v4i8_v4i16_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: ldp s2, s3, [x1] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: add v0.4h, v0.4h, v2.4h -; CHECK-NEXT: add v1.4h, v1.4h, v3.4h +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 ; CHECK-NEXT: ushll v4.2d, v1.2s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v2.2d, v4.2d +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v2.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v4.2d ; CHECK-NEXT: ret %l11 = load <4 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -1796,26 +1715,26 @@ define <8 x i64> @dblext_v8i8_v8i16_v8i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v8i8_v8i16_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d3, [x1] -; CHECK-NEXT: ldp d1, d2, [x0] -; 
CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b -; CHECK-NEXT: uaddl v1.8h, v2.8b, v3.8b -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v6.2d, v1.2s, #3 -; CHECK-NEXT: ushll v7.2d, v3.2s, #3 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: uaddl2 v2.8h, v0.16b, v1.16b +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #0 +; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v5.2d, v4.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v2.4s, #3 +; CHECK-NEXT: sub v3.2d, v5.2d, v3.2d +; CHECK-NEXT: ushll2 v5.2d, v0.4s, #0 +; CHECK-NEXT: ushll v7.2d, v1.2s, #3 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v1.2d, v5.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v2.2d, v4.2d, v7.2d ; CHECK-NEXT: ret %l11 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -1839,44 +1758,44 @@ define <16 x i64> @dblext_v16i8_v16i16_v16i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v16i8_v16i16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: uaddl2 v4.8h, v1.16b, v0.16b -; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b -; CHECK-NEXT: uaddl v1.8h, v2.8b, v3.8b -; CHECK-NEXT: uaddl2 v2.8h, v2.16b, v3.16b -; CHECK-NEXT: ushll v3.4s, v0.4h, #0 -; CHECK-NEXT: ushll v5.4s, 
v4.4h, #0 -; CHECK-NEXT: ushll2 v4.4s, v4.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v16.2d, v3.2s, #0 -; CHECK-NEXT: ushll2 v19.2d, v3.4s, #0 -; CHECK-NEXT: ushll v6.2d, v4.2s, #0 -; CHECK-NEXT: ushll2 v3.2d, v4.4s, #0 -; CHECK-NEXT: ushll2 v4.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v17.2d, v5.2s, #0 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: uaddl v4.8h, v0.8b, v2.8b +; CHECK-NEXT: uaddl2 v0.8h, v0.16b, v2.16b +; CHECK-NEXT: uaddl2 v5.8h, v1.16b, v3.16b +; CHECK-NEXT: uaddl v1.8h, v1.8b, v3.8b +; CHECK-NEXT: ushll2 v6.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v2.4s, v5.8h, #0 +; CHECK-NEXT: ushll2 v16.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v17.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v18.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v17.4s, #0 +; CHECK-NEXT: sub v7.2d, v7.2d, v3.2d +; CHECK-NEXT: sub v3.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll v18.4s, v5.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v5.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v0.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: sub v5.2d, v19.2d, v5.2d +; CHECK-NEXT: ushll v19.4s, v1.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: sub v6.2d, v6.2d, v2.2d +; CHECK-NEXT: ushll2 v1.2d, v19.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v4.4s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d +; CHECK-NEXT: sub v2.2d, v17.2d, v16.2d +; CHECK-NEXT: ushll v16.2d, v18.2s, #3 +; CHECK-NEXT: ushll v17.2d, v19.2s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 ; CHECK-NEXT: ushll v18.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v20.2d, v1.2s, #3 -; CHECK-NEXT: ushll v21.2d, 
v2.2s, #3 -; CHECK-NEXT: ushll v22.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: ushll v23.2d, v4.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #3 -; CHECK-NEXT: ushll2 v24.2d, v7.4s, #3 -; CHECK-NEXT: sub v7.2d, v3.2d, v4.2d -; CHECK-NEXT: sub v3.2d, v0.2d, v24.2d -; CHECK-NEXT: sub v5.2d, v5.2d, v2.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v23.2d -; CHECK-NEXT: sub v1.2d, v19.2d, v1.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v16.2d, v20.2d -; CHECK-NEXT: sub v4.2d, v17.2d, v21.2d +; CHECK-NEXT: sub v0.2d, v4.2d, v17.2d +; CHECK-NEXT: sub v4.2d, v18.2d, v16.2d ; CHECK-NEXT: ret %l11 = load <16 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -1900,26 +1819,13 @@ define <2 x i64> @dblext_v2i8_v2i32_v2i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v2i8_v2i32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #2] -; CHECK-NEXT: ldrb w10, [x1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrb w8, [x1, #2] -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: ldrb w10, [x0, #3] -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: ldrb w8, [x1, #1] -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: ldrb w9, [x1, #3] -; CHECK-NEXT: mov v1.s[1], w10 -; CHECK-NEXT: mov v2.s[1], w8 -; CHECK-NEXT: mov v3.s[1], w9 -; CHECK-NEXT: add v0.2s, v0.2s, v2.2s -; CHECK-NEXT: add v1.2s, v1.2s, v3.2s +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ldr s1, [x1] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #3 ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %l11 = load <2 x i8>, ptr %p1 @@ -1944,20 +1850,17 @@ define <4 x i64> @dblext_v4i8_v4i32_v4i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v4i8_v4i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, 
[x0] -; CHECK-NEXT: ldp s2, s3, [x1] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: uaddl v0.4s, v0.4h, v2.4h -; CHECK-NEXT: uaddl v1.4s, v1.4h, v3.4h -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 ; CHECK-NEXT: ushll v4.2d, v1.2s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v2.2d, v4.2d +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v2.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v4.2d ; CHECK-NEXT: ret %l11 = load <4 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -1981,26 +1884,26 @@ define <8 x i64> @dblext_v8i8_v8i32_v8i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v8i8_v8i32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d3, [x1] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b -; CHECK-NEXT: uaddl v1.8h, v2.8b, v3.8b -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v6.2d, v1.2s, #3 -; CHECK-NEXT: ushll v7.2d, v3.2s, #3 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: uaddl2 v2.8h, v0.16b, v1.16b +; CHECK-NEXT: uaddl v0.8h, 
v0.8b, v1.8b +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #0 +; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v5.2d, v4.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v2.4s, #3 +; CHECK-NEXT: sub v3.2d, v5.2d, v3.2d +; CHECK-NEXT: ushll2 v5.2d, v0.4s, #0 +; CHECK-NEXT: ushll v7.2d, v1.2s, #3 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v1.2d, v5.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v2.2d, v4.2d, v7.2d ; CHECK-NEXT: ret %l11 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -2024,44 +1927,44 @@ define <16 x i64> @dblext_v16i8_v16i32_v16i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v16i8_v16i32_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: uaddl2 v4.8h, v1.16b, v0.16b -; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b -; CHECK-NEXT: uaddl v1.8h, v2.8b, v3.8b -; CHECK-NEXT: uaddl2 v2.8h, v2.16b, v3.16b -; CHECK-NEXT: ushll v3.4s, v0.4h, #0 -; CHECK-NEXT: ushll v5.4s, v4.4h, #0 -; CHECK-NEXT: ushll2 v4.4s, v4.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v16.2d, v3.2s, #0 -; CHECK-NEXT: ushll2 v19.2d, v3.4s, #0 -; CHECK-NEXT: ushll v6.2d, v4.2s, #0 -; CHECK-NEXT: ushll2 v3.2d, v4.4s, #0 -; CHECK-NEXT: ushll2 v4.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v17.2d, v5.2s, #0 -; CHECK-NEXT: ushll v18.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v20.2d, v1.2s, #3 -; CHECK-NEXT: ushll v21.2d, v2.2s, #3 -; CHECK-NEXT: ushll v22.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: ushll v23.2d, v4.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #3 -; 
CHECK-NEXT: ushll2 v24.2d, v7.4s, #3 -; CHECK-NEXT: sub v7.2d, v3.2d, v4.2d -; CHECK-NEXT: sub v3.2d, v0.2d, v24.2d -; CHECK-NEXT: sub v5.2d, v5.2d, v2.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v23.2d -; CHECK-NEXT: sub v1.2d, v19.2d, v1.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v16.2d, v20.2d -; CHECK-NEXT: sub v4.2d, v17.2d, v21.2d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: uaddl v4.8h, v1.8b, v2.8b +; CHECK-NEXT: uaddl2 v1.8h, v1.16b, v2.16b +; CHECK-NEXT: uaddl2 v5.8h, v0.16b, v3.16b +; CHECK-NEXT: uaddl v0.8h, v0.8b, v3.8b +; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v2.4s, v5.8h, #0 +; CHECK-NEXT: ushll2 v16.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v17.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v18.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v17.4s, #0 +; CHECK-NEXT: sub v7.2d, v7.2d, v3.2d +; CHECK-NEXT: sub v3.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll v18.4s, v5.4h, #0 +; CHECK-NEXT: ushll v19.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v5.2d, v19.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: sub v5.2d, v5.2d, v1.2d +; CHECK-NEXT: sub v6.2d, v6.2d, v2.2d +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v4.4s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d +; CHECK-NEXT: sub v2.2d, v17.2d, v16.2d +; CHECK-NEXT: ushll v16.2d, v18.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: ushll v17.2d, v19.2s, #0 +; CHECK-NEXT: sub v0.2d, v4.2d, v0.2d +; CHECK-NEXT: sub v4.2d, v17.2d, v16.2d ; CHECK-NEXT: ret %l11 = load <16 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -2085,26 +1988,11 @@ define <2 x i64> 
@dblext_v2i16_v2i32_v2i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v2i16_v2i32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x0, #4] -; CHECK-NEXT: ldrh w10, [x1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrh w8, [x1, #4] -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: ldrh w9, [x0, #2] -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: ldrh w10, [x0, #6] -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: ldrh w8, [x1, #2] -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: ldrh w9, [x1, #6] -; CHECK-NEXT: mov v1.s[1], w10 -; CHECK-NEXT: mov v2.s[1], w8 -; CHECK-NEXT: mov v3.s[1], w9 -; CHECK-NEXT: add v0.2s, v0.2s, v2.2s -; CHECK-NEXT: add v1.2s, v1.2s, v3.2s +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #3 ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %l11 = load <2 x i16>, ptr %p1 @@ -2129,16 +2017,16 @@ define <4 x i64> @dblext_v4i16_v4i32_v4i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v4i16_v4i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d3, [x1] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: uaddl v0.4s, v1.4h, v0.4h -; CHECK-NEXT: uaddl v1.4s, v2.4h, v3.4h -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 -; CHECK-NEXT: ushll v4.2d, v1.2s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v2.2d, v4.2d +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: uaddl2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d ; CHECK-NEXT: ret %l11 = load <4 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -2162,24 +2050,24 @@ define <8 x i64> 
@dblext_v8i16_v8i32_v8i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v8i16_v8i32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: uaddl2 v4.4s, v1.8h, v0.8h -; CHECK-NEXT: uaddl v0.4s, v1.4h, v0.4h -; CHECK-NEXT: uaddl v1.4s, v2.4h, v3.4h -; CHECK-NEXT: uaddl2 v2.4s, v2.8h, v3.8h -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 -; CHECK-NEXT: ushll v6.2d, v4.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, v4.4s, #0 -; CHECK-NEXT: ushll v4.2d, v1.2s, #3 -; CHECK-NEXT: ushll v7.2d, v2.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: uaddl v4.4s, v0.4h, v2.4h +; CHECK-NEXT: uaddl2 v0.4s, v0.8h, v2.8h +; CHECK-NEXT: uaddl2 v5.4s, v1.8h, v3.8h +; CHECK-NEXT: uaddl v1.4s, v1.4h, v3.4h +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-NEXT: ushll2 v2.2d, v5.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v1.4s, #3 ; CHECK-NEXT: sub v3.2d, v3.2d, v2.2d -; CHECK-NEXT: sub v1.2d, v0.2d, v1.2d -; CHECK-NEXT: sub v0.2d, v5.2d, v4.2d -; CHECK-NEXT: sub v2.2d, v6.2d, v7.2d +; CHECK-NEXT: ushll2 v2.2d, v4.4s, #0 +; CHECK-NEXT: ushll v5.2d, v5.2s, #3 +; CHECK-NEXT: ushll v7.2d, v1.2s, #3 +; CHECK-NEXT: sub v1.2d, v2.2d, v6.2d +; CHECK-NEXT: ushll v2.2d, v4.2s, #0 +; CHECK-NEXT: ushll v4.2d, v0.2s, #0 +; CHECK-NEXT: sub v0.2d, v2.2d, v7.2d +; CHECK-NEXT: sub v2.2d, v4.2d, v5.2d ; CHECK-NEXT: ret %l11 = load <8 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -2203,42 +2091,42 @@ define <16 x i64> @dblext_v16i16_v16i32_v16i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: dblext_v16i16_v16i32_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: uaddl2 v6.4s, v0.8h, v2.8h -; CHECK-NEXT: uaddl v0.4s, v0.4h, v2.4h -; CHECK-NEXT: ldp q5, q4, [x0, #32] -; CHECK-NEXT: uaddl2 v7.4s, v1.8h, v3.8h -; CHECK-NEXT: uaddl v1.4s, v1.4h, v3.4h -; CHECK-NEXT: ushll v18.2d, v0.2s, 
#0 -; CHECK-NEXT: ushll v19.2d, v7.2s, #0 -; CHECK-NEXT: ushll v20.2d, v6.2s, #0 -; CHECK-NEXT: ldp q2, q16, [x1, #32] -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v6.2d, v6.4s, #0 -; CHECK-NEXT: uaddl v17.4s, v5.4h, v2.4h -; CHECK-NEXT: uaddl2 v2.4s, v5.8h, v2.8h -; CHECK-NEXT: uaddl v3.4s, v4.4h, v16.4h -; CHECK-NEXT: uaddl2 v4.4s, v4.8h, v16.8h -; CHECK-NEXT: ushll v16.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v1.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v7.4s, #0 -; CHECK-NEXT: ushll v21.2d, v3.2s, #3 -; CHECK-NEXT: ushll v22.2d, v17.2s, #3 -; CHECK-NEXT: ushll v23.2d, v4.2s, #3 -; CHECK-NEXT: ushll2 v24.2d, v3.4s, #3 -; CHECK-NEXT: ushll v25.2d, v2.2s, #3 -; CHECK-NEXT: ushll2 v17.2d, v17.4s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #3 -; CHECK-NEXT: sub v3.2d, v6.2d, v2.2d -; CHECK-NEXT: sub v7.2d, v1.2d, v4.2d -; CHECK-NEXT: sub v1.2d, v0.2d, v17.2d -; CHECK-NEXT: sub v2.2d, v20.2d, v25.2d -; CHECK-NEXT: sub v5.2d, v5.2d, v24.2d -; CHECK-NEXT: sub v6.2d, v19.2d, v23.2d -; CHECK-NEXT: sub v0.2d, v18.2d, v22.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v21.2d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x0, #32] +; CHECK-NEXT: ldp q6, q4, [x1] +; CHECK-NEXT: uaddl v18.4s, v1.4h, v6.4h +; CHECK-NEXT: uaddl2 v1.4s, v1.8h, v6.8h +; CHECK-NEXT: ldp q7, q5, [x1, #32] +; CHECK-NEXT: uaddl v16.4s, v0.4h, v4.4h +; CHECK-NEXT: uaddl2 v0.4s, v0.8h, v4.8h +; CHECK-NEXT: ushll v20.2d, v1.2s, #0 +; CHECK-NEXT: uaddl2 v4.4s, v3.8h, v7.8h +; CHECK-NEXT: uaddl v19.4s, v3.4h, v7.4h +; CHECK-NEXT: uaddl v17.4s, v2.4h, v5.4h +; CHECK-NEXT: uaddl2 v5.4s, v2.8h, v5.8h +; CHECK-NEXT: ushll2 v2.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v1.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v5.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v0.4s, #0 +; CHECK-NEXT: sub v3.2d, v3.2d, v2.2d +; CHECK-NEXT: sub v7.2d, v7.2d, v6.2d +; CHECK-NEXT: ushll2 v2.2d, v19.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v18.4s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; 
CHECK-NEXT: sub v1.2d, v6.2d, v2.2d +; CHECK-NEXT: sub v2.2d, v20.2d, v4.2d +; CHECK-NEXT: ushll2 v4.2d, v17.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v16.4s, #0 +; CHECK-NEXT: ushll v20.2d, v5.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sub v5.2d, v6.2d, v4.2d +; CHECK-NEXT: sub v6.2d, v0.2d, v20.2d +; CHECK-NEXT: ushll v0.2d, v19.2s, #3 +; CHECK-NEXT: ushll v4.2d, v18.2s, #0 +; CHECK-NEXT: ushll v17.2d, v17.2s, #3 +; CHECK-NEXT: ushll v16.2d, v16.2s, #0 +; CHECK-NEXT: sub v0.2d, v4.2d, v0.2d +; CHECK-NEXT: sub v4.2d, v16.2d, v17.2d ; CHECK-NEXT: ret %l11 = load <16 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 32 @@ -2326,14 +2214,14 @@ define <16 x i16> @std_bv2_v8i8_v16i16(ptr %p1, ptr %p2) { ; CHECK-LABEL: std_bv2_v8i8_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d3, [x1] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ushll v4.8h, v0.8b, #0 -; CHECK-NEXT: ushll v0.8h, v1.8b, #0 -; CHECK-NEXT: ushll v1.8h, v2.8b, #3 -; CHECK-NEXT: ushll v2.8h, v3.8b, #3 -; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h -; CHECK-NEXT: sub v1.8h, v4.8h, v2.8h +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #3 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v3.8h, v1.16b, #3 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: sub v0.8h, v0.8h, v2.8h +; CHECK-NEXT: sub v1.8h, v1.8h, v3.8h ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -2395,17 +2283,16 @@ define <8 x i32> @std_bv2_v4i8_v8i32(ptr %p1, ptr %p2) { ; CHECK-LABEL: std_bv2_v4i8_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ld1 { v1.s }[1], [x1] -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ushll2 
v3.4s, v1.8h, #3 -; CHECK-NEXT: ushll v4.4s, v1.4h, #3 -; CHECK-NEXT: sub v1.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v2.4s, v4.4s +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -2425,24 +2312,24 @@ define <16 x i32> @std_bv2_v8i8_v16i32(ptr %p1, ptr %p2) { ; CHECK-LABEL: std_bv2_v8i8_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d3, [x1] -; CHECK-NEXT: ldp d1, d2, [x0] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v6.4s, v3.4h, #3 -; CHECK-NEXT: ushll v7.4s, v2.4h, #3 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3 -; CHECK-NEXT: sub v1.4s, v1.4s, v2.4s -; CHECK-NEXT: sub v3.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v5.4s, v7.4s -; CHECK-NEXT: sub v2.4s, v4.4s, v6.4s +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 +; CHECK-NEXT: ushll v5.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v4.8h, #3 +; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: ushll2 v3.4s, v5.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #3 +; CHECK-NEXT: ushll v5.4s, v5.4h, #0 +; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: sub v2.4s, v5.4s, v4.4s ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -2462,26 +2349,17 @@ define <4 x i64> @std_bv2_v2i8_v4i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: std_bv2_v2i8_v4i64: ; 
CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x1] -; CHECK-NEXT: ldrb w9, [x0, #2] -; CHECK-NEXT: ldrb w10, [x1, #2] -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: ldrb w9, [x0, #3] -; CHECK-NEXT: fmov d2, x10 -; CHECK-NEXT: ldrb w10, [x1, #3] -; CHECK-NEXT: fmov d3, x8 -; CHECK-NEXT: ldrb w8, [x0, #1] -; CHECK-NEXT: mov v0.d[1], x9 -; CHECK-NEXT: ldrb w9, [x1, #1] -; CHECK-NEXT: mov v2.d[1], x10 -; CHECK-NEXT: mov v3.d[1], x8 -; CHECK-NEXT: mov v1.d[1], x9 -; CHECK-NEXT: shl v0.2d, v0.2d, #3 -; CHECK-NEXT: shl v2.2d, v2.2d, #3 -; CHECK-NEXT: sub v0.2d, v3.2d, v0.2d +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x1] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #3 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: sub v1.2d, v1.2d, v2.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v3.2d ; CHECK-NEXT: ret %lp1 = load <2 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 2 @@ -2501,27 +2379,26 @@ define <8 x i64> @std_bv2_v4i8_v8i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: std_bv2_v4i8_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ld1 { v1.s }[1], [x1] -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll v3.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v6.2d, v3.2s, #3 -; CHECK-NEXT: ushll v7.2d, v1.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: ushll2 v16.2d, v3.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v1.2d -; 
CHECK-NEXT: sub v1.2d, v2.2d, v16.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-NEXT: ushll v5.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: ushll2 v3.2d, v5.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: sub v3.2d, v3.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v2.2d, v5.2d, v4.2d ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -2541,44 +2418,44 @@ define <16 x i64> @std_bv2_v8i8_v16i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: std_bv2_v8i8_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d3, [x1] -; CHECK-NEXT: ldp d1, d2, [x0] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v19.4s, v3.8h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v16.2d, v4.2s, #0 -; CHECK-NEXT: ushll v17.2d, v5.2s, #0 -; CHECK-NEXT: ushll v6.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll v18.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v20.2d, v3.2s, #3 -; CHECK-NEXT: ushll v21.2d, v2.2s, #3 -; CHECK-NEXT: ushll v22.2d, 
v19.2s, #3 -; CHECK-NEXT: ushll2 v23.2d, v3.4s, #3 -; CHECK-NEXT: ushll v24.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v7.2d, v19.4s, #3 -; CHECK-NEXT: sub v3.2d, v1.2d, v3.2d -; CHECK-NEXT: sub v7.2d, v0.2d, v7.2d -; CHECK-NEXT: sub v1.2d, v5.2d, v2.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v24.2d -; CHECK-NEXT: sub v5.2d, v4.2d, v23.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v17.2d, v21.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v20.2d +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 +; CHECK-NEXT: ushll v5.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v16.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v17.4s, v5.8h, #0 +; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v18.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v17.4s, #0 +; CHECK-NEXT: sub v3.2d, v7.2d, v3.2d +; CHECK-NEXT: sub v7.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll v18.4s, v2.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v2.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v0.4s, #0 +; CHECK-NEXT: ushll v20.2d, v1.2s, #3 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: sub v1.2d, v19.2d, v2.2d +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: ushll v19.4s, v5.4h, #0 +; CHECK-NEXT: sub v2.2d, v6.2d, v20.2d +; CHECK-NEXT: ushll2 v5.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v19.4s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: sub v5.2d, v6.2d, v5.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d +; CHECK-NEXT: ushll v16.2d, v18.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ushll v17.2d, v19.2s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v16.2d +; CHECK-NEXT: sub v4.2d, v17.2d, v4.2d ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -2636,14 +2513,14 @@ define <8 x i32> 
@std_bv2_v4i16_v8i32(ptr %p1, ptr %p2) { ; CHECK-LABEL: std_bv2_v4i16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d3, [x1] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll v0.4s, v1.4h, #0 -; CHECK-NEXT: ushll v1.4s, v2.4h, #3 -; CHECK-NEXT: ushll v2.4s, v3.4h, #3 -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v1.4s, v4.4s, v2.4s +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ret %lp1 = load <4 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -2663,20 +2540,20 @@ define <16 x i32> @std_bv2_v8i16_v16i32(ptr %p1, ptr %p2) { ; CHECK-LABEL: std_bv2_v8i16_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: ushll v6.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3 -; CHECK-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v7.4s, v2.4h, #3 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: sub v3.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v2.4s -; CHECK-NEXT: sub v0.4s, v5.4s, v7.4s -; CHECK-NEXT: sub v2.4s, v4.4s, v6.4s +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ushll2 v5.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ldp q4, q3, [x1] +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #3 +; CHECK-NEXT: ushll v2.4s, v2.4h, #3 +; CHECK-NEXT: sub v1.4s, v5.4s, v1.4s +; CHECK-NEXT: ushll2 v5.4s, v4.8h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: ushll2 v6.4s, v3.8h, #3 +; CHECK-NEXT: ushll v7.4s, v3.4h, #3 +; CHECK-NEXT: sub v3.4s, v5.4s, v6.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: sub v2.4s, v4.4s, v7.4s ; CHECK-NEXT: ret %lp1 = load <8 x 
i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -2696,30 +2573,16 @@ define <4 x i64> @std_bv2_v2i16_v4i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: std_bv2_v2i16_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #4 -; CHECK-NEXT: add x9, x1, #4 -; CHECK-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-NEXT: add x10, x0, #2 -; CHECK-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-NEXT: ld1 { v2.h }[0], [x8] -; CHECK-NEXT: add x8, x1, #2 -; CHECK-NEXT: ld1 { v3.h }[0], [x9] -; CHECK-NEXT: add x9, x0, #6 -; CHECK-NEXT: ld1 { v0.h }[2], [x10] -; CHECK-NEXT: add x10, x1, #6 -; CHECK-NEXT: ld1 { v1.h }[2], [x8] -; CHECK-NEXT: ld1 { v2.h }[2], [x9] -; CHECK-NEXT: ld1 { v3.h }[2], [x10] -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-NEXT: uzp1 v1.4h, v2.4h, v3.4h +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: ushll2 v2.2d, v0.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 -; CHECK-NEXT: ushll v4.2d, v1.2s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v2.2d, v4.2d +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v1.2d, v1.2d, v3.2d ; CHECK-NEXT: ret %lp1 = load <2 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -2739,24 +2602,24 @@ define <8 x i64> @std_bv2_v4i16_v8i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: std_bv2_v4i16_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d3, [x1] -; CHECK-NEXT: ldp d1, d2, [x0] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v4.2d, v0.2s, #0 -; CHECK-NEXT: ushll v5.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; 
CHECK-NEXT: ushll v6.2d, v3.2s, #3 -; CHECK-NEXT: ushll v7.2d, v2.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 -; CHECK-NEXT: sub v1.2d, v1.2d, v2.2d -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v5.2d, v7.2d -; CHECK-NEXT: sub v2.2d, v4.2d, v6.2d +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-NEXT: ushll v5.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: ushll2 v3.2d, v5.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: sub v3.2d, v3.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v2.2d, v5.2d, v4.2d ; CHECK-NEXT: ret %lp1 = load <4 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -2776,40 +2639,40 @@ define <16 x i64> @std_bv2_v8i16_v16i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: std_bv2_v8i16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: ushll2 v19.4s, v3.8h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v16.2d, v4.2s, #0 -; CHECK-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v2.8h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v17.2d, v5.2s, #0 -; CHECK-NEXT: ushll v6.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll v18.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v20.2d, v3.2s, #3 -; CHECK-NEXT: ushll v21.2d, v2.2s, #3 -; CHECK-NEXT: ushll v22.2d, v19.2s, #3 -; CHECK-NEXT: ushll2 v23.2d, v3.4s, #3 -; CHECK-NEXT: ushll v24.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; 
CHECK-NEXT: ushll2 v3.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v7.2d, v19.4s, #3 -; CHECK-NEXT: sub v3.2d, v1.2d, v3.2d -; CHECK-NEXT: sub v7.2d, v0.2d, v7.2d -; CHECK-NEXT: sub v1.2d, v5.2d, v2.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v24.2d -; CHECK-NEXT: sub v5.2d, v4.2d, v23.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v17.2d, v21.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v20.2d +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ushll2 v6.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ldp q4, q5, [x1] +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0 +; CHECK-NEXT: ushll v18.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v17.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v3.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v16.4s, v5.8h, #0 +; CHECK-NEXT: ushll2 v19.2d, v17.4s, #0 +; CHECK-NEXT: ushll2 v7.2d, v16.4s, #3 +; CHECK-NEXT: sub v3.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v7.2d, v19.2d, v7.2d +; CHECK-NEXT: ushll2 v1.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v0.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: sub v1.2d, v19.2d, v1.2d +; CHECK-NEXT: ushll v19.4s, v5.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: sub v2.2d, v6.2d, v2.2d +; CHECK-NEXT: ushll2 v5.2d, v19.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: sub v5.2d, v6.2d, v5.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d +; CHECK-NEXT: ushll v16.2d, v18.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v17.2d, v19.2s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v16.2d +; CHECK-NEXT: sub v4.2d, v4.2d, v17.2d ; CHECK-NEXT: ret %lp1 = load <8 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -2829,14 +2692,14 @@ define <4 x i64> @std_bv2_v2i32_v4i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: std_bv2_v2i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d3, [x1] -; CHECK-NEXT: ldp 
d1, d2, [x0] -; CHECK-NEXT: ushll v4.2d, v0.2s, #0 -; CHECK-NEXT: ushll v0.2d, v1.2s, #0 -; CHECK-NEXT: ushll v1.2d, v2.2s, #3 -; CHECK-NEXT: ushll v2.2d, v3.2s, #3 -; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d -; CHECK-NEXT: sub v1.2d, v4.2d, v2.2d +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ushll2 v2.2d, v0.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v1.2d, v1.2d, v3.2d ; CHECK-NEXT: ret %lp1 = load <2 x i32>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -2856,20 +2719,20 @@ define <8 x i64> @std_bv2_v4i32_v8i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: std_bv2_v4i32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] -; CHECK-NEXT: ushll v4.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: ushll v6.2d, v3.2s, #3 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 -; CHECK-NEXT: ushll v5.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v7.2d, v2.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v1.2d, v1.2d, v2.2d -; CHECK-NEXT: sub v0.2d, v5.2d, v7.2d -; CHECK-NEXT: sub v2.2d, v4.2d, v6.2d +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ushll2 v5.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ldp q4, q3, [x1] +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: sub v1.2d, v5.2d, v1.2d +; CHECK-NEXT: ushll2 v5.2d, v4.4s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: ushll2 v6.2d, v3.4s, #3 +; CHECK-NEXT: ushll v7.2d, v3.2s, #3 +; CHECK-NEXT: sub v3.2d, v5.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v2.2d, v4.2d, v7.2d ; CHECK-NEXT: ret %lp1 = load <4 x i32>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -2889,34 +2752,34 @@ define <16 x i64> @std_bv2_v8i32_v16i64(ptr %p1, ptr %p2) { ; CHECK-LABEL: 
std_bv2_v8i32_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: ushll v16.2d, v2.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ushll v6.2d, v3.2s, #0 -; CHECK-NEXT: ushll2 v19.2d, v3.4s, #0 -; CHECK-NEXT: ushll v18.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ldp q5, q4, [x0, #32] -; CHECK-NEXT: ushll v17.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v20.2d, v1.4s, #0 -; CHECK-NEXT: ushll v24.2d, v5.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v5.4s, #3 -; CHECK-NEXT: ldp q3, q7, [x1, #32] -; CHECK-NEXT: ushll v23.2d, v4.2s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v1.2d -; CHECK-NEXT: sub v0.2d, v18.2d, v24.2d -; CHECK-NEXT: ushll v22.2d, v3.2s, #3 -; CHECK-NEXT: ushll2 v25.2d, v3.4s, #3 -; CHECK-NEXT: ushll v21.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v7.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v4.4s, #3 -; CHECK-NEXT: sub v5.2d, v2.2d, v25.2d -; CHECK-NEXT: sub v3.2d, v20.2d, v3.2d -; CHECK-NEXT: sub v7.2d, v19.2d, v7.2d -; CHECK-NEXT: sub v2.2d, v17.2d, v23.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v22.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v21.2d +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: ldp q6, q4, [x0, #32] +; CHECK-NEXT: ushll2 v7.2d, v0.4s, #0 +; CHECK-NEXT: ushll2 v1.2d, v6.4s, #3 +; CHECK-NEXT: ushll v6.2d, v6.2s, #3 +; CHECK-NEXT: ldp q17, q16, [x1, #32] +; CHECK-NEXT: ushll2 v5.2d, v4.4s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll2 v20.2d, v17.4s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ldp q19, q18, [x1] +; CHECK-NEXT: ushll2 v7.2d, v16.4s, #3 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll2 v21.2d, v19.4s, #0 +; CHECK-NEXT: sub v5.2d, v21.2d, v20.2d +; CHECK-NEXT: ushll v20.2d, v0.2s, #0 +; CHECK-NEXT: ushll2 v22.2d, v18.4s, #0 +; CHECK-NEXT: sub v0.2d, v2.2d, v6.2d +; CHECK-NEXT: sub v2.2d, v20.2d, v4.2d +; CHECK-NEXT: ushll 
v4.2d, v17.2s, #3 +; CHECK-NEXT: ushll v6.2d, v19.2s, #0 +; CHECK-NEXT: ushll v17.2d, v18.2s, #0 +; CHECK-NEXT: sub v7.2d, v22.2d, v7.2d +; CHECK-NEXT: sub v4.2d, v6.2d, v4.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d ; CHECK-NEXT: ret %lp1 = load <8 x i32>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 32 @@ -3043,20 +2906,18 @@ define <16 x i16> @dbl_bv2_v8i8_v16i16(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v8i8_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d5, [x2] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ldp d3, d4, [x1] -; CHECK-NEXT: ldp d6, d7, [x3] -; CHECK-NEXT: add v0.8b, v1.8b, v0.8b -; CHECK-NEXT: add v1.8b, v2.8b, v5.8b +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q3, [x3] +; CHECK-NEXT: add v0.16b, v1.16b, v0.16b +; CHECK-NEXT: add v1.16b, v2.16b, v3.16b +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #3 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: add v3.8b, v3.8b, v6.8b -; CHECK-NEXT: add v2.8b, v4.8b, v7.8b -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #3 -; CHECK-NEXT: ushll v2.8h, v2.8b, #3 -; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h -; CHECK-NEXT: sub v1.8h, v3.8h, v2.8h +; CHECK-NEXT: ushll2 v3.8h, v1.16b, #3 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: sub v0.8h, v0.8h, v2.8h +; CHECK-NEXT: sub v1.8h, v1.8h, v3.8h ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -3156,22 +3017,20 @@ define <8 x i32> @dbl_bv2_v4i8_v8i32(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v4i8_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 -; CHECK-NEXT: ldp s2, s3, [x2] -; CHECK-NEXT: ld1 { v1.s }[1], [x1] -; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 -; CHECK-NEXT: add v0.8b, v0.8b, v2.8b +; CHECK-NEXT: ldr d0, [x2] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d3, [x3] +; CHECK-NEXT: add v0.8b, v1.8b, v0.8b +; CHECK-NEXT: add 
v1.8b, v2.8b, v3.8b ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ld1 { v3.s }[1], [x3] -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: add v1.8b, v1.8b, v3.8b ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 -; CHECK-NEXT: ushll v4.4s, v1.4h, #3 -; CHECK-NEXT: sub v1.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v2.4s, v4.4s +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -3201,30 +3060,28 @@ define <16 x i32> @dbl_bv2_v8i8_v16i32(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v8i8_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d5, [x2] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ldp d3, d4, [x1] -; CHECK-NEXT: ldp d6, d7, [x3] -; CHECK-NEXT: add v0.8b, v1.8b, v0.8b -; CHECK-NEXT: add v1.8b, v2.8b, v5.8b +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q3, [x3] +; CHECK-NEXT: add v0.16b, v1.16b, v0.16b +; CHECK-NEXT: add v1.16b, v2.16b, v3.16b +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: add v3.8b, v3.8b, v6.8b -; CHECK-NEXT: add v2.8b, v4.8b, v7.8b -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v4.4s, v3.4h, #0 -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v6.4s, v2.4h, #3 -; CHECK-NEXT: ushll v7.4s, v1.4h, #3 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #3 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: sub v1.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v0.4s, v5.4s, v7.4s -; CHECK-NEXT: sub v2.4s, v4.4s, v6.4s +; CHECK-NEXT: ushll2 
v4.8h, v1.16b, #0 +; CHECK-NEXT: ushll v5.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v4.8h, #3 +; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: ushll2 v3.4s, v5.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #3 +; CHECK-NEXT: ushll v5.4s, v5.4h, #0 +; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: sub v2.4s, v5.4s, v4.4s ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -3254,55 +3111,20 @@ define <4 x i64> @dbl_bv2_v2i8_v4i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v2i8_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: add x9, x1, #2 -; CHECK-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-NEXT: add x10, x0, #1 -; CHECK-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-NEXT: add x11, x3, #1 -; CHECK-NEXT: ld1 { v2.b }[0], [x8] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: ld1 { v3.b }[0], [x9] -; CHECK-NEXT: add x9, x0, #3 -; CHECK-NEXT: ld1 { v4.b }[0], [x2] -; CHECK-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-NEXT: add x8, x2, #2 -; CHECK-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-NEXT: add x9, x3, #2 -; CHECK-NEXT: ld1 { v0.b }[4], [x10] -; CHECK-NEXT: add x10, x1, #3 -; CHECK-NEXT: ld1 { v5.b }[0], [x8] -; CHECK-NEXT: add x8, x2, #3 -; CHECK-NEXT: ld1 { v6.b }[0], [x3] -; CHECK-NEXT: ld1 { v7.b }[0], [x9] -; CHECK-NEXT: add x9, x3, #3 -; CHECK-NEXT: ld1 { v3.b }[4], [x10] -; CHECK-NEXT: add x10, x2, #1 -; CHECK-NEXT: ld1 { v5.b }[4], [x8] -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ld1 { v6.b }[4], [x11] -; CHECK-NEXT: ld1 { v4.b }[4], [x10] -; CHECK-NEXT: ld1 { v7.b }[4], [x9] -; CHECK-NEXT: uzp1 v2.4h, v2.4h, v3.4h -; CHECK-NEXT: uzp1 v1.4h, v4.4h, v6.4h -; CHECK-NEXT: uzp1 v3.4h, v5.4h, v7.4h -; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: add v1.4h, v2.4h, v3.4h -; CHECK-NEXT: movi v2.2d, 
#0x000000000000ff -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ldr s0, [x2] +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x3] +; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: add v0.8b, v1.8b, v0.8b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v4.2d, v1.4s, #0 +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #3 ; CHECK-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-NEXT: ushll v3.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: and v4.16b, v4.16b, v2.16b -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v3.16b, v3.16b, v2.16b -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: shl v2.2d, v4.2d, #3 -; CHECK-NEXT: shl v4.2d, v1.2d, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v2.2d -; CHECK-NEXT: sub v0.2d, v3.2d, v4.2d +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sub v1.2d, v1.2d, v2.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v3.2d ; CHECK-NEXT: ret %lp1 = load <2 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 2 @@ -3332,32 +3154,30 @@ define <8 x i64> @dbl_bv2_v4i8_v8i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v4i8_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 -; CHECK-NEXT: ldp s2, s3, [x2] -; CHECK-NEXT: ld1 { v1.s }[1], [x1] -; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 -; CHECK-NEXT: add v0.8b, v0.8b, v2.8b +; CHECK-NEXT: ldr d0, [x2] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d3, [x3] +; CHECK-NEXT: add v0.8b, v1.8b, v0.8b +; CHECK-NEXT: add v1.8b, v2.8b, v3.8b ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ld1 { v3.s }[1], [x3] -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-NEXT: add v1.8b, v1.8b, v3.8b -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 
-; CHECK-NEXT: ushll v3.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v6.2d, v3.2s, #3 -; CHECK-NEXT: ushll v7.2d, v1.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: ushll2 v16.2d, v3.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v1.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v16.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-NEXT: ushll v5.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: ushll2 v3.2d, v5.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: sub v3.2d, v3.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v2.2d, v5.2d, v4.2d ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -3387,50 +3207,48 @@ define <16 x i64> @dbl_bv2_v8i8_v16i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v8i8_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d6, [x3] -; CHECK-NEXT: ldp d3, d5, [x1] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: add v0.8b, v3.8b, v0.8b -; CHECK-NEXT: ldp d4, d3, [x2] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q3, [x3] +; CHECK-NEXT: add v0.16b, v1.16b, v0.16b +; CHECK-NEXT: add v1.16b, v2.16b, v3.16b +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: add v1.8b, v1.8b, v4.8b -; CHECK-NEXT: add v2.8b, v2.8b, v3.8b -; CHECK-NEXT: add v3.8b, v5.8b, v6.8b -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 
-; CHECK-NEXT: ushll v7.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v5.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v6.4s, v3.8h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v16.2d, v4.2s, #0 -; CHECK-NEXT: ushll v17.2d, v7.2s, #0 -; CHECK-NEXT: ushll v18.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll v19.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v20.2d, v7.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v21.2d, v3.2s, #3 -; CHECK-NEXT: ushll v22.2d, v2.2s, #3 -; CHECK-NEXT: ushll v23.2d, v6.2s, #3 -; CHECK-NEXT: ushll2 v24.2d, v3.4s, #3 -; CHECK-NEXT: ushll v25.2d, v5.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v5.4s, #3 -; CHECK-NEXT: ushll2 v5.2d, v6.4s, #3 -; CHECK-NEXT: sub v3.2d, v1.2d, v3.2d -; CHECK-NEXT: sub v7.2d, v0.2d, v5.2d -; CHECK-NEXT: sub v1.2d, v20.2d, v2.2d -; CHECK-NEXT: sub v2.2d, v19.2d, v25.2d -; CHECK-NEXT: sub v5.2d, v4.2d, v24.2d -; CHECK-NEXT: sub v6.2d, v18.2d, v23.2d -; CHECK-NEXT: sub v0.2d, v17.2d, v22.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v21.2d +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 +; CHECK-NEXT: ushll v5.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v16.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v17.4s, v5.8h, #0 +; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v18.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v17.4s, #0 +; CHECK-NEXT: sub v3.2d, v7.2d, v3.2d +; CHECK-NEXT: sub v7.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll v18.4s, v2.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v2.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v0.4s, #0 +; CHECK-NEXT: ushll v20.2d, v1.2s, #3 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: sub v1.2d, v19.2d, v2.2d +; CHECK-NEXT: ushll v4.4s, 
v4.4h, #0 +; CHECK-NEXT: ushll v19.4s, v5.4h, #0 +; CHECK-NEXT: sub v2.2d, v6.2d, v20.2d +; CHECK-NEXT: ushll2 v5.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v19.4s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: sub v5.2d, v6.2d, v5.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d +; CHECK-NEXT: ushll v16.2d, v18.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ushll v17.2d, v19.2s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v16.2d +; CHECK-NEXT: sub v4.2d, v17.2d, v4.2d ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -3526,20 +3344,18 @@ define <8 x i32> @dbl_bv2_v4i16_v8i32(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v4i16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d5, [x2] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ldp d3, d4, [x1] -; CHECK-NEXT: ldp d6, d7, [x3] -; CHECK-NEXT: add v0.4h, v1.4h, v0.4h -; CHECK-NEXT: add v1.4h, v2.4h, v5.4h +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q3, [x3] +; CHECK-NEXT: add v0.8h, v1.8h, v0.8h +; CHECK-NEXT: add v1.8h, v2.8h, v3.8h +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: add v3.4h, v3.4h, v6.4h -; CHECK-NEXT: add v2.4h, v4.4h, v7.4h -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 -; CHECK-NEXT: ushll v2.4s, v2.4h, #3 -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v2.4s +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ret %lp1 = load <4 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -3569,26 +3385,26 @@ define <16 x i32> @dbl_bv2_v8i16_v16i32(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v8i16_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q5, [x2] -; CHECK-NEXT: ldp 
q1, q2, [x0] +; CHECK-NEXT: ldp q3, q0, [x3] +; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: add v2.8h, v2.8h, v3.8h +; CHECK-NEXT: ldp q5, q4, [x0] ; CHECK-NEXT: add v0.8h, v1.8h, v0.8h -; CHECK-NEXT: ldp q3, q4, [x1] -; CHECK-NEXT: add v2.8h, v2.8h, v5.8h -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v16.4s, v2.4h, #3 -; CHECK-NEXT: ldp q6, q7, [x3] -; CHECK-NEXT: add v1.8h, v3.8h, v6.8h -; CHECK-NEXT: add v3.8h, v4.8h, v7.8h -; CHECK-NEXT: ushll v4.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0 -; CHECK-NEXT: ushll v7.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v1.4s, v2.8h, #3 -; CHECK-NEXT: ushll2 v2.4s, v3.8h, #3 -; CHECK-NEXT: sub v1.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v3.4s, v6.4s, v2.4s -; CHECK-NEXT: sub v0.4s, v5.4s, v16.4s -; CHECK-NEXT: sub v2.4s, v4.4s, v7.4s +; CHECK-NEXT: ushll2 v7.4s, v2.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ldp q3, q6, [x2] +; CHECK-NEXT: add v5.8h, v5.8h, v3.8h +; CHECK-NEXT: add v4.8h, v4.8h, v6.8h +; CHECK-NEXT: ushll2 v3.4s, v5.8h, #0 +; CHECK-NEXT: ushll2 v1.4s, v4.8h, #3 +; CHECK-NEXT: ushll2 v6.4s, v0.8h, #3 +; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v6.4s +; CHECK-NEXT: ushll v4.4s, v4.4h, #3 +; CHECK-NEXT: ushll v5.4s, v5.4h, #0 +; CHECK-NEXT: ushll v6.4s, v0.4h, #3 +; CHECK-NEXT: sub v0.4s, v5.4s, v4.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s ; CHECK-NEXT: ret %lp1 = load <8 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -3618,48 +3434,20 @@ define <4 x i64> @dbl_bv2_v2i16_v4i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v2i16_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #4 -; CHECK-NEXT: add x9, x1, #4 -; CHECK-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-NEXT: add x10, x0, #6 -; CHECK-NEXT: ld1 { v2.h }[0], [x1] -; CHECK-NEXT: add x11, x2, #6 -; CHECK-NEXT: ld1 { v1.h }[0], [x8] -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: ld1 { v3.h }[0], [x9] -; CHECK-NEXT: add x9, x1, #2 -; CHECK-NEXT: ld1 
{ v4.h }[0], [x2] -; CHECK-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-NEXT: add x8, x2, #2 -; CHECK-NEXT: ld1 { v2.h }[2], [x9] -; CHECK-NEXT: add x9, x2, #4 -; CHECK-NEXT: ld1 { v1.h }[2], [x10] -; CHECK-NEXT: add x10, x3, #4 -; CHECK-NEXT: ld1 { v5.h }[0], [x3] -; CHECK-NEXT: ld1 { v4.h }[2], [x8] -; CHECK-NEXT: add x8, x3, #2 -; CHECK-NEXT: ld1 { v6.h }[0], [x9] -; CHECK-NEXT: add x9, x1, #6 -; CHECK-NEXT: ld1 { v7.h }[0], [x10] -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v2.4h -; CHECK-NEXT: ld1 { v5.h }[2], [x8] -; CHECK-NEXT: add x8, x3, #6 -; CHECK-NEXT: ld1 { v3.h }[2], [x9] -; CHECK-NEXT: ld1 { v6.h }[2], [x11] -; CHECK-NEXT: ld1 { v7.h }[2], [x8] -; CHECK-NEXT: uzp1 v2.4h, v4.4h, v5.4h -; CHECK-NEXT: uzp1 v1.4h, v1.4h, v3.4h -; CHECK-NEXT: uzp1 v3.4h, v6.4h, v7.4h -; CHECK-NEXT: add v0.4h, v0.4h, v2.4h +; CHECK-NEXT: ldr d0, [x2] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d3, [x3] +; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: add v1.4h, v2.4h, v3.4h ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: add v1.4h, v1.4h, v3.4h -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: ushll2 v2.2d, v0.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 -; CHECK-NEXT: ushll v4.2d, v1.2s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v2.2d, v4.2d +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v1.2d, v1.2d, v3.2d ; CHECK-NEXT: ret %lp1 = load <2 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -3689,30 +3477,28 @@ define <8 x i64> @dbl_bv2_v4i16_v8i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v4i16_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d5, [x2] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ldp d3, d4, [x1] -; CHECK-NEXT: ldp d6, d7, [x3] -; CHECK-NEXT: add v0.4h, v1.4h, v0.4h -; CHECK-NEXT: add v1.4h, v2.4h, v5.4h +; CHECK-NEXT: 
ldr q0, [x2] +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q3, [x3] +; CHECK-NEXT: add v0.8h, v1.8h, v0.8h +; CHECK-NEXT: add v1.8h, v2.8h, v3.8h +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: add v3.4h, v3.4h, v6.4h -; CHECK-NEXT: add v2.4h, v4.4h, v7.4h -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v4.2d, v3.2s, #0 -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v6.2d, v2.2s, #3 -; CHECK-NEXT: ushll v7.2d, v1.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v1.2d -; CHECK-NEXT: sub v3.2d, v3.2d, v2.2d -; CHECK-NEXT: sub v0.2d, v5.2d, v7.2d -; CHECK-NEXT: sub v2.2d, v4.2d, v6.2d +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-NEXT: ushll v5.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: ushll2 v3.2d, v5.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: sub v3.2d, v3.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v2.2d, v5.2d, v4.2d ; CHECK-NEXT: ret %lp1 = load <4 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -3742,46 +3528,46 @@ define <16 x i64> @dbl_bv2_v8i16_v16i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v8i16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q5, [x2] -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q3, q0, [x3] +; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: add v16.8h, v2.8h, v3.8h +; CHECK-NEXT: ldp q5, q4, [x0] ; CHECK-NEXT: add v0.8h, v1.8h, v0.8h -; CHECK-NEXT: ldp q3, q4, [x1] -; CHECK-NEXT: add v2.8h, v2.8h, 
v5.8h -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v17.2d, v5.2s, #0 -; CHECK-NEXT: ushll v18.2d, v0.2s, #0 -; CHECK-NEXT: ldp q6, q7, [x3] -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: add v1.8h, v3.8h, v6.8h -; CHECK-NEXT: add v3.8h, v4.8h, v7.8h -; CHECK-NEXT: ushll v4.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v19.4s, v3.8h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v16.2d, v4.2s, #0 -; CHECK-NEXT: ushll v6.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v20.2d, v3.2s, #3 -; CHECK-NEXT: ushll v21.2d, v2.2s, #3 -; CHECK-NEXT: ushll v22.2d, v19.2s, #3 -; CHECK-NEXT: ushll2 v23.2d, v3.4s, #3 -; CHECK-NEXT: ushll v24.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v7.2d, v19.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v7.2d, v1.2d, v7.2d -; CHECK-NEXT: sub v1.2d, v5.2d, v2.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v24.2d -; CHECK-NEXT: sub v5.2d, v4.2d, v23.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v17.2d, v21.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v20.2d +; CHECK-NEXT: ushll2 v17.4s, v16.8h, #0 +; CHECK-NEXT: ushll2 v19.2d, v17.4s, #0 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: ldp q2, q6, [x2] +; CHECK-NEXT: add v2.8h, v5.8h, v2.8h +; CHECK-NEXT: add v1.8h, v4.8h, v6.8h +; CHECK-NEXT: ushll2 v5.4s, v2.8h, #0 +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v3.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v5.4s, #0 +; CHECK-NEXT: ushll2 v18.2d, v6.4s, #3 +; CHECK-NEXT: sub v3.2d, v7.2d, v3.2d +; CHECK-NEXT: sub v7.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll v18.4s, v1.4h, #0 +; CHECK-NEXT: ushll v19.4s, v2.4h, #0 +; 
CHECK-NEXT: ushll2 v1.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v19.4s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d +; CHECK-NEXT: sub v2.2d, v5.2d, v4.2d +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v4.4s, v16.4h, #0 +; CHECK-NEXT: ushll2 v5.2d, v0.4s, #3 +; CHECK-NEXT: ushll2 v16.2d, v4.4s, #0 +; CHECK-NEXT: ushll v6.2d, v6.2s, #3 +; CHECK-NEXT: sub v5.2d, v16.2d, v5.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v6.2d +; CHECK-NEXT: ushll v16.2d, v18.2s, #3 +; CHECK-NEXT: ushll v17.2d, v19.2s, #0 +; CHECK-NEXT: ushll v18.2d, v0.2s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v0.2d, v17.2d, v16.2d +; CHECK-NEXT: sub v4.2d, v4.2d, v18.2d ; CHECK-NEXT: ret %lp1 = load <8 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -3811,20 +3597,18 @@ define <4 x i64> @dbl_bv2_v2i32_v4i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v2i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d5, [x2] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ldp d3, d4, [x1] -; CHECK-NEXT: ldp d6, d7, [x3] -; CHECK-NEXT: add v0.2s, v1.2s, v0.2s -; CHECK-NEXT: add v1.2s, v2.2s, v5.2s +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q3, [x3] +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v2.4s, v3.4s +; CHECK-NEXT: ushll2 v2.2d, v0.4s, #3 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: add v3.2s, v3.2s, v6.2s -; CHECK-NEXT: add v2.2s, v4.2s, v7.2s -; CHECK-NEXT: ushll v3.2d, v3.2s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #3 -; CHECK-NEXT: ushll v2.2d, v2.2s, #3 -; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d -; CHECK-NEXT: sub v1.2d, v3.2d, v2.2d +; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v1.2d, v1.2d, v3.2d ; CHECK-NEXT: ret %lp1 = load <2 x i32>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -3854,26 +3638,26 @@ 
define <8 x i64> @dbl_bv2_v4i32_v8i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v4i32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q5, [x2] -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q3, q0, [x3] +; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s +; CHECK-NEXT: ldp q5, q4, [x0] ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: ldp q3, q4, [x1] -; CHECK-NEXT: add v2.4s, v2.4s, v5.4s -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v16.2d, v2.2s, #3 -; CHECK-NEXT: ldp q6, q7, [x3] -; CHECK-NEXT: add v1.4s, v3.4s, v6.4s -; CHECK-NEXT: add v3.4s, v4.4s, v7.4s -; CHECK-NEXT: ushll v4.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v6.2d, v1.4s, #0 -; CHECK-NEXT: ushll v7.2d, v3.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v2.2d, v3.4s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v1.2d -; CHECK-NEXT: sub v3.2d, v6.2d, v2.2d -; CHECK-NEXT: sub v0.2d, v5.2d, v16.2d -; CHECK-NEXT: sub v2.2d, v4.2d, v7.2d +; CHECK-NEXT: ushll2 v7.2d, v2.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: ldp q3, q6, [x2] +; CHECK-NEXT: add v5.4s, v5.4s, v3.4s +; CHECK-NEXT: add v4.4s, v4.4s, v6.4s +; CHECK-NEXT: ushll2 v3.2d, v5.4s, #0 +; CHECK-NEXT: ushll2 v1.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v0.4s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v6.2d +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: ushll v6.2d, v0.2s, #3 +; CHECK-NEXT: sub v0.2d, v5.2d, v4.2d +; CHECK-NEXT: sub v2.2d, v2.2d, v6.2d ; CHECK-NEXT: ret %lp1 = load <4 x i32>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 16 @@ -3903,45 +3687,45 @@ define <16 x i64> @dbl_bv2_v8i32_v16i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dbl_bv2_v8i32_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q17, q0, [x2] -; CHECK-NEXT: ldp q2, q1, [x0] -; CHECK-NEXT: add v2.4s, v2.4s, v17.4s -; CHECK-NEXT: ldp q6, q5, [x1] -; CHECK-NEXT: add v0.4s, 
v1.4s, v0.4s -; CHECK-NEXT: ldp q19, q18, [x3] +; CHECK-NEXT: ldp q0, q17, [x3] +; CHECK-NEXT: ldp q5, q6, [x1, #32] +; CHECK-NEXT: ldp q7, q16, [x1] +; CHECK-NEXT: add v21.4s, v7.4s, v0.4s +; CHECK-NEXT: ldp q18, q19, [x3, #32] +; CHECK-NEXT: add v16.4s, v16.4s, v17.4s +; CHECK-NEXT: add v18.4s, v5.4s, v18.4s +; CHECK-NEXT: ldp q1, q2, [x0, #32] ; CHECK-NEXT: add v6.4s, v6.4s, v19.4s -; CHECK-NEXT: ldp q3, q4, [x0, #32] -; CHECK-NEXT: add v5.4s, v5.4s, v18.4s -; CHECK-NEXT: ushll2 v20.2d, v5.4s, #0 -; CHECK-NEXT: ldp q7, q16, [x1, #32] -; CHECK-NEXT: ldp q17, q1, [x3, #32] -; CHECK-NEXT: add v7.4s, v7.4s, v17.4s -; CHECK-NEXT: ushll v17.2d, v6.2s, #0 -; CHECK-NEXT: ldp q19, q18, [x2, #32] -; CHECK-NEXT: add v1.4s, v16.4s, v1.4s -; CHECK-NEXT: ushll v16.2d, v5.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v6.4s, #0 -; CHECK-NEXT: add v3.4s, v3.4s, v19.4s -; CHECK-NEXT: ushll v19.2d, v2.2s, #0 -; CHECK-NEXT: add v4.4s, v4.4s, v18.4s -; CHECK-NEXT: ushll v18.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll v6.2d, v1.2s, #3 -; CHECK-NEXT: ushll v21.2d, v7.2s, #3 -; CHECK-NEXT: ushll v22.2d, v4.2s, #3 -; CHECK-NEXT: ushll v23.2d, v3.2s, #3 -; CHECK-NEXT: ushll2 v24.2d, v1.4s, #3 -; CHECK-NEXT: ushll2 v7.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v1.2d, v3.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v4.4s, #3 -; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v5.2d, v5.2d, v7.2d -; CHECK-NEXT: sub v7.2d, v20.2d, v24.2d -; CHECK-NEXT: sub v0.2d, v19.2d, v23.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v22.2d -; CHECK-NEXT: sub v4.2d, v17.2d, v21.2d +; CHECK-NEXT: ldp q3, q4, [x0] +; CHECK-NEXT: ldp q20, q17, [x2] +; CHECK-NEXT: ldp q19, q0, [x2, #32] +; CHECK-NEXT: add v4.4s, v4.4s, v17.4s +; CHECK-NEXT: add v17.4s, v3.4s, v20.4s +; CHECK-NEXT: ushll2 v7.2d, v4.4s, #0 +; CHECK-NEXT: ushll2 v3.2d, v17.4s, #0 +; CHECK-NEXT: ushll2 v20.2d, v16.4s, #0 +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s +; 
CHECK-NEXT: add v2.4s, v1.4s, v19.4s +; CHECK-NEXT: ushll2 v5.2d, v0.4s, #3 +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v6.4s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll2 v5.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v21.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: sub v5.2d, v7.2d, v5.2d +; CHECK-NEXT: sub v7.2d, v20.2d, v19.2d +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: ushll v19.2d, v0.2s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v0.2d, v17.2d, v2.2d +; CHECK-NEXT: sub v2.2d, v4.2d, v19.2d +; CHECK-NEXT: ushll v4.2d, v18.2s, #3 +; CHECK-NEXT: ushll v17.2d, v21.2s, #0 +; CHECK-NEXT: ushll v6.2d, v6.2s, #3 +; CHECK-NEXT: ushll v16.2d, v16.2s, #0 +; CHECK-NEXT: sub v4.2d, v17.2d, v4.2d ; CHECK-NEXT: sub v6.2d, v16.2d, v6.2d ; CHECK-NEXT: ret %lp1 = load <8 x i32>, ptr %p1 @@ -4046,20 +3830,18 @@ define <8 x i32> @dblext_bv2_v4i8_v8i16_v8i32(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dblext_bv2_v4i8_v8i16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 -; CHECK-NEXT: ldp s2, s3, [x2] -; CHECK-NEXT: ld1 { v1.s }[1], [x1] -; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 -; CHECK-NEXT: uaddl v0.8h, v0.8b, v2.8b -; CHECK-NEXT: ld1 { v3.s }[1], [x3] -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: uaddl v1.8h, v1.8b, v3.8b +; CHECK-NEXT: ldr d0, [x2] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d3, [x3] +; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b +; CHECK-NEXT: uaddl v1.8h, v2.8b, v3.8b +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 -; CHECK-NEXT: ushll v4.4s, v1.4h, #3 -; CHECK-NEXT: sub v1.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v2.4s, v4.4s +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: sub v1.4s, v1.4s, 
v3.4s ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -4093,26 +3875,26 @@ define <16 x i32> @dblext_bv2_v8i8_v16i16_v16i32(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dblext_bv2_v8i8_v16i16_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d5, [x2] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ldp d3, d4, [x1] -; CHECK-NEXT: ldp d6, d7, [x3] -; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b -; CHECK-NEXT: uaddl v2.8h, v2.8b, v5.8b -; CHECK-NEXT: uaddl v1.8h, v3.8b, v6.8b -; CHECK-NEXT: uaddl v3.8h, v4.8b, v7.8b -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll v4.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v7.4s, v3.4h, #3 -; CHECK-NEXT: ushll v16.4s, v2.4h, #3 -; CHECK-NEXT: ushll2 v1.4s, v2.8h, #3 -; CHECK-NEXT: ushll2 v2.4s, v3.8h, #3 -; CHECK-NEXT: sub v1.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v3.4s, v6.4s, v2.4s -; CHECK-NEXT: sub v0.4s, v5.4s, v16.4s -; CHECK-NEXT: sub v2.4s, v4.4s, v7.4s +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q3, [x2] +; CHECK-NEXT: ldr q0, [x3] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: uaddl2 v5.8h, v1.16b, v3.16b +; CHECK-NEXT: uaddl v3.8h, v1.8b, v3.8b +; CHECK-NEXT: uaddl v4.8h, v2.8b, v0.8b +; CHECK-NEXT: uaddl2 v0.8h, v2.16b, v0.16b +; CHECK-NEXT: ushll2 v1.4s, v5.8h, #3 +; CHECK-NEXT: ushll2 v2.4s, v3.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v0.8h, #3 +; CHECK-NEXT: sub v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ushll2 v2.4s, v4.8h, #0 +; CHECK-NEXT: ushll v5.4s, v5.4h, #3 +; CHECK-NEXT: ushll v7.4s, v3.4h, #0 +; CHECK-NEXT: sub v3.4s, v2.4s, v6.4s +; CHECK-NEXT: ushll v2.4s, v0.4h, #3 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: sub v0.4s, v7.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v4.4s, v2.4s ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -4146,52 +3928,19 @@ define <4 x i64> @dblext_bv2_v2i8_v4i16_v4i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: 
dblext_bv2_v2i8_v4i16_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: add x9, x1, #2 -; CHECK-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-NEXT: add x10, x0, #1 -; CHECK-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-NEXT: add x11, x2, #3 -; CHECK-NEXT: ld1 { v2.b }[0], [x8] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: ld1 { v3.b }[0], [x9] -; CHECK-NEXT: add x9, x0, #3 -; CHECK-NEXT: ld1 { v4.b }[0], [x2] -; CHECK-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-NEXT: add x8, x2, #2 -; CHECK-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-NEXT: add x9, x3, #2 -; CHECK-NEXT: ld1 { v0.b }[4], [x10] -; CHECK-NEXT: add x10, x1, #3 -; CHECK-NEXT: ld1 { v5.b }[0], [x3] -; CHECK-NEXT: ld1 { v6.b }[0], [x8] -; CHECK-NEXT: add x8, x3, #1 -; CHECK-NEXT: ld1 { v7.b }[0], [x9] -; CHECK-NEXT: add x9, x3, #3 -; CHECK-NEXT: ld1 { v3.b }[4], [x10] -; CHECK-NEXT: add x10, x2, #1 -; CHECK-NEXT: ld1 { v5.b }[4], [x8] -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ld1 { v6.b }[4], [x11] -; CHECK-NEXT: ld1 { v4.b }[4], [x10] -; CHECK-NEXT: ld1 { v7.b }[4], [x9] -; CHECK-NEXT: uzp1 v2.4h, v2.4h, v3.4h -; CHECK-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-NEXT: uzp1 v1.4h, v4.4h, v5.4h -; CHECK-NEXT: uzp1 v3.4h, v6.4h, v7.4h -; CHECK-NEXT: bic v2.4h, #255, lsl #8 -; CHECK-NEXT: bic v1.4h, #255, lsl #8 -; CHECK-NEXT: bic v3.4h, #255, lsl #8 -; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: add v1.4h, v2.4h, v3.4h +; CHECK-NEXT: ldr s0, [x2] +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x3] +; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 -; CHECK-NEXT: ushll v4.2d, v1.2s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v2.2d, v4.2d +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #3 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; 
CHECK-NEXT: ushll2 v3.2d, v0.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sub v1.2d, v1.2d, v2.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v3.2d ; CHECK-NEXT: ret %lp1 = load <2 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 2 @@ -4225,30 +3974,28 @@ define <8 x i64> @dblext_bv2_v4i8_v8i16_v8i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dblext_bv2_v4i8_v8i16_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 -; CHECK-NEXT: ldp s2, s3, [x2] -; CHECK-NEXT: ld1 { v1.s }[1], [x1] -; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 -; CHECK-NEXT: uaddl v0.8h, v0.8b, v2.8b -; CHECK-NEXT: ld1 { v3.s }[1], [x3] -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-NEXT: uaddl v1.8h, v1.8b, v3.8b -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll v3.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v6.2d, v3.2s, #3 -; CHECK-NEXT: ushll v7.2d, v1.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: ushll2 v16.2d, v3.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v1.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v16.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: ldr d0, [x2] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d3, [x3] +; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b +; CHECK-NEXT: uaddl v1.8h, v2.8b, v3.8b +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-NEXT: ushll v5.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: ushll2 v3.2d, v5.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; 
CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: sub v3.2d, v3.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v2.2d, v5.2d, v4.2d ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -4282,46 +4029,46 @@ define <16 x i64> @dblext_bv2_v8i8_v16i16_v16i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dblext_bv2_v8i8_v16i16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d5, [x2] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ldp d3, d4, [x1] -; CHECK-NEXT: ldp d6, d7, [x3] -; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b -; CHECK-NEXT: uaddl v2.8h, v2.8b, v5.8b -; CHECK-NEXT: uaddl v1.8h, v3.8b, v6.8b -; CHECK-NEXT: uaddl v3.8h, v4.8b, v7.8b -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll v4.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v19.4s, v3.8h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v16.2d, v4.2s, #0 -; CHECK-NEXT: ushll v17.2d, v5.2s, #0 -; CHECK-NEXT: ushll v6.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll v18.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v20.2d, v3.2s, #3 -; CHECK-NEXT: ushll v21.2d, v2.2s, #3 -; CHECK-NEXT: ushll v22.2d, v19.2s, #3 -; CHECK-NEXT: ushll2 v23.2d, v3.4s, #3 -; CHECK-NEXT: ushll v24.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v7.2d, v19.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v7.2d, v1.2d, v7.2d -; CHECK-NEXT: sub v1.2d, v5.2d, v2.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v24.2d -; CHECK-NEXT: sub v5.2d, v4.2d, v23.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v17.2d, v21.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v20.2d +; CHECK-NEXT: ldr q0, [x3] +; 
CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q3, [x2] +; CHECK-NEXT: uaddl v4.8h, v2.8b, v0.8b +; CHECK-NEXT: uaddl2 v5.8h, v1.16b, v3.16b +; CHECK-NEXT: uaddl v1.8h, v1.8b, v3.8b +; CHECK-NEXT: uaddl2 v0.8h, v2.16b, v0.16b +; CHECK-NEXT: ushll2 v2.4s, v5.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v16.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v17.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v18.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v17.4s, #0 +; CHECK-NEXT: sub v3.2d, v7.2d, v3.2d +; CHECK-NEXT: sub v7.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll v18.4s, v5.4h, #0 +; CHECK-NEXT: ushll v19.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v5.2d, v19.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: sub v1.2d, v5.2d, v1.2d +; CHECK-NEXT: sub v2.2d, v6.2d, v2.2d +; CHECK-NEXT: ushll2 v5.2d, v0.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: sub v5.2d, v6.2d, v5.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d +; CHECK-NEXT: ushll v16.2d, v18.2s, #3 +; CHECK-NEXT: ushll v17.2d, v19.2s, #0 +; CHECK-NEXT: ushll v18.2d, v0.2s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v0.2d, v17.2d, v16.2d +; CHECK-NEXT: sub v4.2d, v4.2d, v18.2d ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -4355,55 +4102,19 @@ define <4 x i64> @dblext_bv2_v2i8_v4i32_v4i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dblext_bv2_v2i8_v4i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: add x9, x1, #2 -; CHECK-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-NEXT: add x10, x0, #1 -; CHECK-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-NEXT: add x11, x3, #1 -; CHECK-NEXT: ld1 { v2.b }[0], 
[x8] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: ld1 { v3.b }[0], [x9] -; CHECK-NEXT: add x9, x0, #3 -; CHECK-NEXT: ld1 { v4.b }[0], [x2] -; CHECK-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-NEXT: add x8, x2, #2 -; CHECK-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-NEXT: add x9, x3, #2 -; CHECK-NEXT: ld1 { v0.b }[4], [x10] -; CHECK-NEXT: add x10, x1, #3 -; CHECK-NEXT: ld1 { v5.b }[0], [x8] -; CHECK-NEXT: add x8, x2, #3 -; CHECK-NEXT: ld1 { v6.b }[0], [x3] -; CHECK-NEXT: ld1 { v7.b }[0], [x9] -; CHECK-NEXT: add x9, x3, #3 -; CHECK-NEXT: ld1 { v3.b }[4], [x10] -; CHECK-NEXT: add x10, x2, #1 -; CHECK-NEXT: ld1 { v5.b }[4], [x8] -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ld1 { v6.b }[4], [x11] -; CHECK-NEXT: ld1 { v4.b }[4], [x10] -; CHECK-NEXT: ld1 { v7.b }[4], [x9] -; CHECK-NEXT: uzp1 v1.4h, v2.4h, v3.4h -; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff +; CHECK-NEXT: ldr s0, [x2] +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x3] +; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: uzp1 v3.4h, v4.4h, v6.4h -; CHECK-NEXT: uzp1 v4.4h, v5.4h, v7.4h -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v4.4s, v4.4h, #0 -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v3.16b, v3.16b, v2.16b -; CHECK-NEXT: and v2.16b, v4.16b, v2.16b -; CHECK-NEXT: add v0.4s, v0.4s, v3.4s -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 -; CHECK-NEXT: ushll v4.2d, v1.2s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v2.2d, v4.2d +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #3 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sub v1.2d, v1.2d, v2.2d +; CHECK-NEXT: sub v0.2d, 
v0.2d, v3.2d ; CHECK-NEXT: ret %lp1 = load <2 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 2 @@ -4437,30 +4148,28 @@ define <8 x i64> @dblext_bv2_v4i8_v8i32_v8i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dblext_bv2_v4i8_v8i32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 -; CHECK-NEXT: ldp s2, s3, [x2] -; CHECK-NEXT: ld1 { v1.s }[1], [x1] -; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 -; CHECK-NEXT: uaddl v0.8h, v0.8b, v2.8b -; CHECK-NEXT: ld1 { v3.s }[1], [x3] -; CHECK-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-NEXT: uaddl v1.8h, v1.8b, v3.8b -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll v3.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v6.2d, v3.2s, #3 -; CHECK-NEXT: ushll v7.2d, v1.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: ushll2 v16.2d, v3.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v1.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v16.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: ldr d0, [x2] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d3, [x3] +; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b +; CHECK-NEXT: uaddl v1.8h, v2.8b, v3.8b +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-NEXT: ushll v5.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: ushll2 v3.2d, v5.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: sub v3.2d, v3.2d, v6.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v2.2d, v5.2d, 
v4.2d ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -4494,46 +4203,46 @@ define <16 x i64> @dblext_bv2_v8i8_v16i32_v16i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dblext_bv2_v8i8_v16i32_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d5, [x2] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ldp d3, d4, [x1] -; CHECK-NEXT: ldp d6, d7, [x3] -; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b -; CHECK-NEXT: uaddl v2.8h, v2.8b, v5.8b -; CHECK-NEXT: uaddl v1.8h, v3.8b, v6.8b -; CHECK-NEXT: uaddl v3.8h, v4.8b, v7.8b -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll v4.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v19.4s, v3.8h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v16.2d, v4.2s, #0 -; CHECK-NEXT: ushll v17.2d, v5.2s, #0 -; CHECK-NEXT: ushll v6.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll v18.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v20.2d, v3.2s, #3 -; CHECK-NEXT: ushll v21.2d, v2.2s, #3 -; CHECK-NEXT: ushll v22.2d, v19.2s, #3 -; CHECK-NEXT: ushll2 v23.2d, v3.4s, #3 -; CHECK-NEXT: ushll v24.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v7.2d, v19.4s, #3 -; CHECK-NEXT: sub v3.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v7.2d, v1.2d, v7.2d -; CHECK-NEXT: sub v1.2d, v5.2d, v2.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v24.2d -; CHECK-NEXT: sub v5.2d, v4.2d, v23.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v17.2d, v21.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v20.2d +; CHECK-NEXT: ldr q0, [x3] +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q3, [x2] +; CHECK-NEXT: uaddl v4.8h, v2.8b, v0.8b +; CHECK-NEXT: uaddl2 v5.8h, 
v1.16b, v3.16b +; CHECK-NEXT: uaddl v1.8h, v1.8b, v3.8b +; CHECK-NEXT: uaddl2 v0.8h, v2.16b, v0.16b +; CHECK-NEXT: ushll2 v2.4s, v5.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v16.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v17.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v18.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v17.4s, #0 +; CHECK-NEXT: sub v3.2d, v7.2d, v3.2d +; CHECK-NEXT: sub v7.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll v18.4s, v5.4h, #0 +; CHECK-NEXT: ushll v19.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v5.2d, v19.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: sub v1.2d, v5.2d, v1.2d +; CHECK-NEXT: sub v2.2d, v6.2d, v2.2d +; CHECK-NEXT: ushll2 v5.2d, v0.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #0 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: sub v5.2d, v6.2d, v5.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d +; CHECK-NEXT: ushll v16.2d, v18.2s, #3 +; CHECK-NEXT: ushll v17.2d, v19.2s, #0 +; CHECK-NEXT: ushll v18.2d, v0.2s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v0.2d, v17.2d, v16.2d +; CHECK-NEXT: sub v4.2d, v4.2d, v18.2d ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -4567,46 +4276,18 @@ define <4 x i64> @dblext_bv2_v2i16_v4i32_v4i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dblext_bv2_v2i16_v4i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #4 -; CHECK-NEXT: add x9, x1, #4 -; CHECK-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-NEXT: add x10, x0, #2 -; CHECK-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-NEXT: add x11, x2, #6 -; CHECK-NEXT: ld1 { v2.h }[0], [x8] -; CHECK-NEXT: add x8, x1, #2 -; CHECK-NEXT: ld1 { v3.h }[0], [x9] -; CHECK-NEXT: add x9, x0, #6 -; CHECK-NEXT: ld1 { v4.h }[0], [x2] -; 
CHECK-NEXT: ld1 { v1.h }[2], [x8] -; CHECK-NEXT: add x8, x2, #4 -; CHECK-NEXT: ld1 { v2.h }[2], [x9] -; CHECK-NEXT: add x9, x3, #4 -; CHECK-NEXT: ld1 { v0.h }[2], [x10] -; CHECK-NEXT: add x10, x1, #6 -; CHECK-NEXT: ld1 { v5.h }[0], [x3] -; CHECK-NEXT: ld1 { v6.h }[0], [x8] -; CHECK-NEXT: add x8, x3, #2 -; CHECK-NEXT: ld1 { v7.h }[0], [x9] -; CHECK-NEXT: add x9, x3, #6 -; CHECK-NEXT: ld1 { v3.h }[2], [x10] -; CHECK-NEXT: add x10, x2, #2 -; CHECK-NEXT: ld1 { v5.h }[2], [x8] -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ld1 { v6.h }[2], [x11] -; CHECK-NEXT: ld1 { v4.h }[2], [x10] -; CHECK-NEXT: ld1 { v7.h }[2], [x9] -; CHECK-NEXT: uzp1 v2.4h, v2.4h, v3.4h -; CHECK-NEXT: uzp1 v1.4h, v4.4h, v5.4h -; CHECK-NEXT: uzp1 v3.4h, v6.4h, v7.4h -; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: ldr d0, [x2] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d3, [x3] +; CHECK-NEXT: uaddl v0.4s, v1.4h, v0.4h ; CHECK-NEXT: uaddl v1.4s, v2.4h, v3.4h -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: ushll2 v2.2d, v0.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 -; CHECK-NEXT: ushll v4.2d, v1.2s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v2.2d, v4.2d +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v1.2d, v1.2d, v3.2d ; CHECK-NEXT: ret %lp1 = load <2 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 4 @@ -4640,26 +4321,26 @@ define <8 x i64> @dblext_bv2_v4i16_v8i32_v8i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dblext_bv2_v4i16_v8i32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d5, [x2] -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ldp d3, d4, [x1] -; CHECK-NEXT: ldp d6, d7, [x3] -; CHECK-NEXT: uaddl v0.4s, v1.4h, v0.4h -; CHECK-NEXT: uaddl v2.4s, v2.4h, v5.4h -; CHECK-NEXT: uaddl v1.4s, v3.4h, v6.4h -; CHECK-NEXT: uaddl v3.4s, v4.4h, v7.4h -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 
-; CHECK-NEXT: ushll v4.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v6.2d, v1.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v7.2d, v3.2s, #3 -; CHECK-NEXT: ushll v16.2d, v2.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v2.2d, v3.4s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v1.2d -; CHECK-NEXT: sub v3.2d, v6.2d, v2.2d -; CHECK-NEXT: sub v0.2d, v5.2d, v16.2d -; CHECK-NEXT: sub v2.2d, v4.2d, v7.2d +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q3, [x2] +; CHECK-NEXT: ldr q0, [x3] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: uaddl2 v5.4s, v1.8h, v3.8h +; CHECK-NEXT: uaddl v3.4s, v1.4h, v3.4h +; CHECK-NEXT: uaddl v4.4s, v2.4h, v0.4h +; CHECK-NEXT: uaddl2 v0.4s, v2.8h, v0.8h +; CHECK-NEXT: ushll2 v1.2d, v5.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v3.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v0.4s, #3 +; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d +; CHECK-NEXT: ushll2 v2.2d, v4.4s, #0 +; CHECK-NEXT: ushll v5.2d, v5.2s, #3 +; CHECK-NEXT: ushll v7.2d, v3.2s, #0 +; CHECK-NEXT: sub v3.2d, v2.2d, v6.2d +; CHECK-NEXT: ushll v2.2d, v0.2s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v0.2d, v7.2d, v5.2d +; CHECK-NEXT: sub v2.2d, v4.2d, v2.2d ; CHECK-NEXT: ret %lp1 = load <4 x i16>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -4692,42 +4373,42 @@ define <16 x i64> @dblext_bv2_v8i16_v16i32_v16i64(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { ; CHECK-LABEL: dblext_bv2_v8i16_v16i32_v16i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q6, [x2] -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: uaddl2 v7.4s, v1.8h, v0.8h -; CHECK-NEXT: uaddl v0.4s, v1.4h, v0.4h -; CHECK-NEXT: ldp q3, q4, [x1] -; CHECK-NEXT: ushll v20.2d, v7.2s, #0 -; CHECK-NEXT: ushll v18.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v7.2d, v7.4s, #0 -; CHECK-NEXT: ldp q5, q16, [x3] -; CHECK-NEXT: uaddl2 v17.4s, v3.8h, v5.8h -; CHECK-NEXT: uaddl v1.4s, v3.4h, v5.4h -; CHECK-NEXT: uaddl v3.4s, v4.4h, v16.4h -; CHECK-NEXT: uaddl v5.4s, v2.4h, v6.4h -; CHECK-NEXT: uaddl2 
v4.4s, v4.8h, v16.8h -; CHECK-NEXT: uaddl2 v2.4s, v2.8h, v6.8h -; CHECK-NEXT: ushll v16.2d, v1.2s, #0 -; CHECK-NEXT: ushll v6.2d, v17.2s, #0 -; CHECK-NEXT: ushll2 v19.2d, v1.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v17.4s, #0 -; CHECK-NEXT: ushll v17.2d, v3.2s, #3 -; CHECK-NEXT: ushll v21.2d, v5.2s, #3 -; CHECK-NEXT: ushll v22.2d, v4.2s, #3 -; CHECK-NEXT: ushll2 v23.2d, v3.4s, #3 -; CHECK-NEXT: ushll v24.2d, v2.2s, #3 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #3 -; CHECK-NEXT: sub v3.2d, v7.2d, v2.2d -; CHECK-NEXT: sub v7.2d, v1.2d, v4.2d -; CHECK-NEXT: sub v1.2d, v0.2d, v5.2d -; CHECK-NEXT: sub v2.2d, v20.2d, v24.2d -; CHECK-NEXT: sub v5.2d, v19.2d, v23.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v18.2d, v21.2d +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q5, [x3] +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q3, q4, [x1] +; CHECK-NEXT: uaddl v16.4s, v3.4h, v0.4h +; CHECK-NEXT: uaddl2 v0.4s, v3.8h, v0.8h +; CHECK-NEXT: ldp q6, q7, [x2] +; CHECK-NEXT: uaddl v17.4s, v4.4h, v5.4h +; CHECK-NEXT: uaddl2 v4.4s, v4.8h, v5.8h +; CHECK-NEXT: uaddl v18.4s, v1.4h, v6.4h +; CHECK-NEXT: uaddl2 v1.4s, v1.8h, v6.8h +; CHECK-NEXT: uaddl v19.4s, v2.4h, v7.4h +; CHECK-NEXT: uaddl2 v2.4s, v2.8h, v7.8h +; CHECK-NEXT: ushll2 v5.2d, v1.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v0.4s, #0 +; CHECK-NEXT: sub v3.2d, v5.2d, v3.2d +; CHECK-NEXT: sub v7.2d, v7.2d, v6.2d +; CHECK-NEXT: ushll2 v5.2d, v19.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v18.4s, #0 +; CHECK-NEXT: ushll v20.2d, v1.2s, #0 +; CHECK-NEXT: sub v1.2d, v6.2d, v5.2d +; CHECK-NEXT: ushll2 v5.2d, v17.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v16.4s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: sub v5.2d, v6.2d, v5.2d +; CHECK-NEXT: sub v6.2d, v0.2d, v4.2d +; CHECK-NEXT: ushll v0.2d, v19.2s, 
#3 +; CHECK-NEXT: ushll v4.2d, v18.2s, #0 +; CHECK-NEXT: ushll v17.2d, v17.2s, #3 +; CHECK-NEXT: ushll v16.2d, v16.2s, #0 +; CHECK-NEXT: sub v2.2d, v20.2d, v2.2d +; CHECK-NEXT: sub v0.2d, v4.2d, v0.2d ; CHECK-NEXT: sub v4.2d, v16.2d, v17.2d ; CHECK-NEXT: ret %lp1 = load <8 x i16>, ptr %p1 @@ -4875,22 +4556,22 @@ define <32 x i16> @std_bv4_v8i8_v32i16(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: std_bv4_v8i8_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d5, [x3] -; CHECK-NEXT: ldp d1, d2, [x1] -; CHECK-NEXT: ldp d3, d4, [x2] -; CHECK-NEXT: ushll v6.8h, v0.8b, #0 -; CHECK-NEXT: ldp d0, d7, [x0] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll v5.8h, v5.8b, #3 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q1, [x2] +; CHECK-NEXT: ldr q3, [x3] +; CHECK-NEXT: ushll2 v4.8h, v0.16b, #3 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v4.8h, v4.8b, #3 -; CHECK-NEXT: ushll v7.8h, v7.8b, #3 -; CHECK-NEXT: ushll v2.8h, v2.8b, #3 -; CHECK-NEXT: sub v0.8h, v0.8h, v7.8h -; CHECK-NEXT: sub v1.8h, v1.8h, v2.8h -; CHECK-NEXT: sub v2.8h, v3.8h, v4.8h -; CHECK-NEXT: sub v3.8h, v6.8h, v5.8h +; CHECK-NEXT: ushll2 v5.8h, v2.16b, #3 +; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: sub v0.8h, v0.8h, v4.8h +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #3 +; CHECK-NEXT: ushll v6.8h, v1.8b, #0 +; CHECK-NEXT: sub v1.8h, v2.8h, v5.8h +; CHECK-NEXT: ushll2 v5.8h, v3.16b, #3 +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: sub v2.8h, v6.8h, v4.8h +; CHECK-NEXT: sub v3.8h, v3.8h, v5.8h ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -4997,28 +4678,26 @@ define <16 x i32> @std_bv4_v4i8_v16i32(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: std_bv4_v4i8_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x2] -; CHECK-NEXT: ldp s2, s3, [x0] -; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 -; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; 
CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ld1 { v1.s }[1], [x3] -; CHECK-NEXT: ld1 { v3.s }[1], [x1] -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll v5.4s, v2.4h, #0 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d0, [x2] +; CHECK-NEXT: ldr d3, [x3] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: ushll v4.8h, v0.8b, #0 ; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: ushll v6.4s, v1.4h, #3 -; CHECK-NEXT: ushll v7.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3 -; CHECK-NEXT: ushll2 v16.4s, v1.8h, #3 -; CHECK-NEXT: sub v1.4s, v2.4s, v3.4s -; CHECK-NEXT: sub v3.4s, v0.4s, v16.4s -; CHECK-NEXT: sub v0.4s, v5.4s, v7.4s -; CHECK-NEXT: sub v2.4s, v4.4s, v6.4s +; CHECK-NEXT: ushll2 v0.4s, v1.8h, #3 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v5.4s, v2.8h, #3 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ushll2 v6.4s, v4.8h, #3 +; CHECK-NEXT: sub v1.4s, v2.4s, v5.4s +; CHECK-NEXT: ushll v2.4s, v4.4h, #0 +; CHECK-NEXT: ushll2 v4.4s, v3.8h, #3 +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s +; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -5052,42 +4731,42 @@ define <32 x i32> @std_bv4_v8i8_v32i32(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: std_bv4_v8i8_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d1, d2, [x1] -; CHECK-NEXT: ldp d3, d4, [x2] -; CHECK-NEXT: ldp d6, d7, [x0] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ldp d0, d5, [x3] -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll v6.8h, v6.8b, #0 -; CHECK-NEXT: ushll v17.4s, v3.4h, #0 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v18.4s, v1.4h, #0 -; CHECK-NEXT: ushll v19.4s, v6.4h, #0 -; CHECK-NEXT: ushll2 v20.4s, v3.8h, #0 -; CHECK-NEXT: ushll2 
v3.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v1.4s, v6.8h, #0 -; CHECK-NEXT: ushll v6.8h, v7.8b, #0 +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ushll2 v6.8h, v1.16b, #0 +; CHECK-NEXT: ldr q4, [x3] +; CHECK-NEXT: ushll v16.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v17.8h, v2.16b, #0 ; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v5.8h, v5.8b, #0 +; CHECK-NEXT: ushll2 v18.8h, v0.16b, #0 +; CHECK-NEXT: ushll2 v1.4s, v6.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v16.8h, #0 +; CHECK-NEXT: ushll2 v5.4s, v17.8h, #3 +; CHECK-NEXT: ushll2 v7.4s, v2.8h, #0 +; CHECK-NEXT: ushll v19.8h, v0.8b, #0 +; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v5.4s +; CHECK-NEXT: ushll2 v0.4s, v18.8h, #3 +; CHECK-NEXT: ushll2 v5.4s, v19.8h, #0 +; CHECK-NEXT: ushll2 v20.8h, v4.16b, #0 ; CHECK-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-NEXT: ushll v16.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v21.4s, v5.4h, #3 -; CHECK-NEXT: ushll v22.4s, v4.4h, #3 -; CHECK-NEXT: ushll v23.4s, v2.4h, #3 -; CHECK-NEXT: ushll v24.4s, v6.4h, #3 -; CHECK-NEXT: ushll2 v7.4s, v5.8h, #3 -; CHECK-NEXT: ushll2 v4.4s, v4.8h, #3 -; CHECK-NEXT: ushll2 v5.4s, v6.8h, #3 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v5.4s, v20.4s, v4.4s -; CHECK-NEXT: sub v7.4s, v0.4s, v7.4s -; CHECK-NEXT: sub v0.4s, v19.4s, v24.4s -; CHECK-NEXT: sub v2.4s, v18.4s, v23.4s -; CHECK-NEXT: sub v4.4s, v17.4s, v22.4s -; CHECK-NEXT: sub v6.4s, v16.4s, v21.4s +; CHECK-NEXT: sub v5.4s, v5.4s, v0.4s +; CHECK-NEXT: ushll v0.4s, v6.4h, #3 +; CHECK-NEXT: ushll v6.4s, v16.4h, #0 +; CHECK-NEXT: ushll v16.4s, v17.4h, #3 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ushll2 v7.4s, v20.8h, #3 +; CHECK-NEXT: ushll2 v21.4s, v4.8h, #0 +; CHECK-NEXT: sub v0.4s, v6.4s, v0.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v16.4s +; CHECK-NEXT: ushll v6.4s, v18.4h, #3 +; CHECK-NEXT: ushll 
v16.4s, v19.4h, #0 +; CHECK-NEXT: ushll v17.4s, v20.4h, #3 +; CHECK-NEXT: ushll v18.4s, v4.4h, #0 +; CHECK-NEXT: sub v7.4s, v21.4s, v7.4s +; CHECK-NEXT: sub v4.4s, v16.4s, v6.4s +; CHECK-NEXT: sub v6.4s, v18.4s, v17.4s ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -5121,56 +4800,28 @@ define <8 x i64> @std_bv4_v2i8_v8i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: std_bv4_v2i8_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: ld1 { v4.b }[0], [x0] -; CHECK-NEXT: add x9, x0, #3 -; CHECK-NEXT: add x10, x2, #2 -; CHECK-NEXT: ld1 { v0.b }[0], [x8] -; CHECK-NEXT: add x8, x0, #1 -; CHECK-NEXT: ld1 { v4.b }[4], [x8] -; CHECK-NEXT: add x8, x1, #2 -; CHECK-NEXT: ld1 { v0.b }[4], [x9] -; CHECK-NEXT: add x9, x1, #1 -; CHECK-NEXT: ld1 { v5.b }[0], [x1] -; CHECK-NEXT: ld1 { v1.b }[0], [x8] -; CHECK-NEXT: add x8, x1, #3 -; CHECK-NEXT: ld1 { v5.b }[4], [x9] -; CHECK-NEXT: add x9, x2, #3 -; CHECK-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-NEXT: add x8, x2, #1 -; CHECK-NEXT: ld1 { v6.b }[0], [x2] -; CHECK-NEXT: ld1 { v2.b }[0], [x10] -; CHECK-NEXT: add x10, x3, #2 -; CHECK-NEXT: ld1 { v6.b }[4], [x8] -; CHECK-NEXT: adrp x8, .LCPI118_0 -; CHECK-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-NEXT: add x9, x3, #1 -; CHECK-NEXT: ld1 { v7.b }[0], [x3] -; CHECK-NEXT: ld1 { v3.b }[0], [x10] -; CHECK-NEXT: add x10, x3, #3 -; CHECK-NEXT: ldr d16, [x8, :lo12:.LCPI118_0] -; CHECK-NEXT: ld1 { v7.b }[4], [x9] -; CHECK-NEXT: ld1 { v3.b }[4], [x10] -; CHECK-NEXT: tbl v4.8b, { v4.16b, v5.16b, v6.16b, v7.16b }, v16.8b -; CHECK-NEXT: tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v16.8b -; CHECK-NEXT: ushll v1.8h, v4.8b, #0 +; CHECK-NEXT: ldr s0, [x2] +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x3] +; CHECK-NEXT: ld1 { v1.s }[1], [x1] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; 
CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 +; CHECK-NEXT: ushll v4.4s, v1.4h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-NEXT: ushll v5.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v6.2d, v0.2s, #3 -; CHECK-NEXT: ushll v7.2d, v3.2s, #3 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #3 -; CHECK-NEXT: sub v3.2d, v1.2d, v3.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v0.2d +; CHECK-NEXT: ushll2 v1.2d, v3.4s, #3 +; CHECK-NEXT: ushll v3.2d, v3.2s, #0 +; CHECK-NEXT: ushll2 v5.2d, v2.4s, #3 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v3.2d, v2.2d, v5.2d +; CHECK-NEXT: ushll2 v2.2d, v0.4s, #3 +; CHECK-NEXT: ushll v5.2d, v0.2s, #0 ; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: sub v2.2d, v5.2d, v2.2d ; CHECK-NEXT: ret %lp1 = load <2 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 2 @@ -5204,48 +4855,46 @@ define <16 x i64> @std_bv4_v4i8_v16i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: std_bv4_v4i8_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x2] -; CHECK-NEXT: ldp s2, s3, [x0] -; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 -; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ld1 { v1.s }[1], [x3] -; CHECK-NEXT: ld1 { v3.s }[1], [x1] -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll v5.4s, v2.4h, #0 +; CHECK-NEXT: ldr d0, [x2] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d4, [x3] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v3.8h, #0 -; CHECK-NEXT: ushll2 v19.4s, v1.8h, #0 -; 
CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v16.2d, v4.2s, #0 -; CHECK-NEXT: ushll v17.2d, v5.2s, #0 -; CHECK-NEXT: ushll v6.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll v18.2d, v2.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll v20.2d, v1.2s, #3 -; CHECK-NEXT: ushll v21.2d, v3.2s, #3 -; CHECK-NEXT: ushll v22.2d, v19.2s, #3 -; CHECK-NEXT: ushll2 v23.2d, v1.4s, #3 -; CHECK-NEXT: ushll v24.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v3.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v7.2d, v19.4s, #3 -; CHECK-NEXT: sub v3.2d, v2.2d, v3.2d -; CHECK-NEXT: sub v7.2d, v0.2d, v7.2d -; CHECK-NEXT: sub v1.2d, v5.2d, v1.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v24.2d -; CHECK-NEXT: sub v5.2d, v4.2d, v23.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v17.2d, v21.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v20.2d +; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0 +; CHECK-NEXT: ushll v16.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v17.4s, v2.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ushll2 v18.4s, v0.8h, #0 +; CHECK-NEXT: ushll v19.4s, v0.4h, #0 +; CHECK-NEXT: ushll v0.8h, v4.8b, #0 +; CHECK-NEXT: ushll2 v1.2d, v6.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v16.4s, #0 +; CHECK-NEXT: ushll2 v5.2d, v17.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v2.4s, #0 +; CHECK-NEXT: ushll2 v20.4s, v0.8h, #0 +; CHECK-NEXT: ushll v21.4s, v0.4h, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll2 v5.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v4.2d, v19.4s, #0 +; CHECK-NEXT: ushll2 v0.2d, v20.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v21.4s, #0 +; CHECK-NEXT: sub v5.2d, v4.2d, v5.2d +; CHECK-NEXT: sub v7.2d, v7.2d, v0.2d +; CHECK-NEXT: ushll v0.2d, v6.2s, #3 +; CHECK-NEXT: ushll v4.2d, v16.2s, #0 
+; CHECK-NEXT: ushll v6.2d, v17.2s, #3 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: sub v0.2d, v4.2d, v0.2d +; CHECK-NEXT: sub v2.2d, v2.2d, v6.2d +; CHECK-NEXT: ushll v4.2d, v18.2s, #3 +; CHECK-NEXT: ushll v6.2d, v19.2s, #0 +; CHECK-NEXT: ushll v16.2d, v20.2s, #3 +; CHECK-NEXT: ushll v17.2d, v21.2s, #0 +; CHECK-NEXT: sub v4.2d, v6.2d, v4.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -5279,106 +4928,90 @@ define <32 x i64> @std_bv4_v8i8_v32i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: std_bv4_v8i8_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str d14, [sp, #-64]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_offset b8, -8 -; CHECK-NEXT: .cfi_offset b9, -16 -; CHECK-NEXT: .cfi_offset b10, -24 -; CHECK-NEXT: .cfi_offset b11, -32 -; CHECK-NEXT: .cfi_offset b12, -40 -; CHECK-NEXT: .cfi_offset b13, -48 -; CHECK-NEXT: .cfi_offset b14, -64 -; CHECK-NEXT: ldp d2, d26, [x3] -; CHECK-NEXT: ldp d0, d25, [x2] -; CHECK-NEXT: ldp d1, d24, [x0] -; CHECK-NEXT: ushll v6.8h, v2.8b, #0 -; CHECK-NEXT: ldp d3, d27, [x1] -; CHECK-NEXT: ushll v26.8h, v26.8b, #0 -; CHECK-NEXT: ushll v7.8h, v0.8b, #0 -; CHECK-NEXT: ushll v20.8h, v1.8b, #0 -; CHECK-NEXT: ushll v16.8h, v3.8b, #0 -; CHECK-NEXT: ushll2 v3.4s, v6.8h, #0 -; CHECK-NEXT: ushll v25.8h, v25.8b, #0 -; CHECK-NEXT: ushll v31.4s, v26.4h, #0 -; CHECK-NEXT: ushll2 v26.4s, v26.8h, #0 -; CHECK-NEXT: ushll2 v18.4s, v7.8h, #0 -; CHECK-NEXT: ushll2 v4.2d, v3.4s, #0 -; CHECK-NEXT: ushll2 v21.4s, v20.8h, #0 -; CHECK-NEXT: ushll v17.2d, v3.2s, #0 -; CHECK-NEXT: ushll v28.4s, v7.4h, #0 -; CHECK-NEXT: ushll v29.4s, v16.4h, #0 -; CHECK-NEXT: ushll v27.8h, v27.8b, #0 -; CHECK-NEXT: ushll v30.4s, v25.4h, #0 -; CHECK-NEXT: ushll2 v25.4s, 
v25.8h, #0 -; CHECK-NEXT: ushll2 v8.2d, v26.4s, #3 -; CHECK-NEXT: ushll v26.2d, v26.2s, #3 -; CHECK-NEXT: ushll2 v19.4s, v16.8h, #0 -; CHECK-NEXT: ushll2 v5.2d, v18.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v21.4s, #0 -; CHECK-NEXT: ushll v23.4s, v6.4h, #0 -; CHECK-NEXT: ushll v18.2d, v18.2s, #0 -; CHECK-NEXT: ushll2 v3.2d, v28.4s, #0 -; CHECK-NEXT: ushll2 v6.2d, v29.4s, #0 -; CHECK-NEXT: ushll v16.4s, v20.4h, #0 -; CHECK-NEXT: ushll v22.2d, v21.2s, #0 -; CHECK-NEXT: ushll v21.2d, v28.2s, #0 -; CHECK-NEXT: ushll v20.2d, v29.2s, #0 -; CHECK-NEXT: ushll v28.8h, v24.8b, #0 -; CHECK-NEXT: ushll v29.4s, v27.4h, #0 -; CHECK-NEXT: ushll2 v27.4s, v27.8h, #0 -; CHECK-NEXT: ushll2 v9.2d, v25.4s, #3 -; CHECK-NEXT: ushll v25.2d, v25.2s, #3 -; CHECK-NEXT: sub v4.2d, v4.2d, v8.2d -; CHECK-NEXT: sub v17.2d, v17.2d, v26.2d -; CHECK-NEXT: ushll2 v2.2d, v19.4s, #0 -; CHECK-NEXT: ushll v19.2d, v19.2s, #0 -; CHECK-NEXT: stp q17, q4, [x8, #224] -; CHECK-NEXT: ushll v24.4s, v28.4h, #0 -; CHECK-NEXT: ushll2 v28.4s, v28.8h, #0 -; CHECK-NEXT: ushll2 v10.2d, v27.4s, #3 -; CHECK-NEXT: ushll v27.2d, v27.2s, #3 -; CHECK-NEXT: sub v5.2d, v5.2d, v9.2d -; CHECK-NEXT: sub v4.2d, v18.2d, v25.2d -; CHECK-NEXT: ushll2 v11.2d, v28.4s, #3 -; CHECK-NEXT: ushll v28.2d, v28.2s, #3 -; CHECK-NEXT: stp q4, q5, [x8, #160] -; CHECK-NEXT: sub v2.2d, v2.2d, v10.2d -; CHECK-NEXT: sub v5.2d, v19.2d, v27.2d -; CHECK-NEXT: ushll2 v1.2d, v23.4s, #0 -; CHECK-NEXT: ushll v23.2d, v23.2s, #0 -; CHECK-NEXT: stp q5, q2, [x8, #96] -; CHECK-NEXT: ushll2 v12.2d, v31.4s, #3 -; CHECK-NEXT: ushll v31.2d, v31.2s, #3 -; CHECK-NEXT: sub v0.2d, v0.2d, v11.2d -; CHECK-NEXT: sub v2.2d, v22.2d, v28.2d -; CHECK-NEXT: ushll2 v13.2d, v30.4s, #3 -; CHECK-NEXT: ushll v30.2d, v30.2s, #3 -; CHECK-NEXT: stp q2, q0, [x8, #32] -; CHECK-NEXT: sub v1.2d, v1.2d, v12.2d -; CHECK-NEXT: sub v0.2d, v23.2d, v31.2d -; CHECK-NEXT: ushll2 v14.2d, v29.4s, #3 -; CHECK-NEXT: ushll v29.2d, v29.2s, #3 -; CHECK-NEXT: stp q0, q1, [x8, #192] -; CHECK-NEXT: sub v2.2d, 
v3.2d, v13.2d -; CHECK-NEXT: sub v1.2d, v21.2d, v30.2d -; CHECK-NEXT: sub v0.2d, v6.2d, v14.2d -; CHECK-NEXT: stp q1, q2, [x8, #128] -; CHECK-NEXT: sub v2.2d, v20.2d, v29.2d -; CHECK-NEXT: ushll2 v7.2d, v16.4s, #0 -; CHECK-NEXT: ushll v1.2d, v16.2s, #0 -; CHECK-NEXT: stp q2, q0, [x8, #64] -; CHECK-NEXT: ushll2 v0.2d, v24.4s, #3 -; CHECK-NEXT: ushll v2.2d, v24.2s, #3 -; CHECK-NEXT: sub v0.2d, v7.2d, v0.2d -; CHECK-NEXT: sub v1.2d, v1.2d, v2.2d +; CHECK-NEXT: ldr q6, [x3] +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q3, [x1] +; CHECK-NEXT: ushll2 v7.8h, v6.16b, #0 +; CHECK-NEXT: ldr q5, [x2] +; CHECK-NEXT: ushll v6.8h, v6.8b, #0 +; CHECK-NEXT: ushll v16.4s, v7.4h, #0 +; CHECK-NEXT: ushll v17.4s, v6.4h, #0 +; CHECK-NEXT: ushll2 v0.8h, v1.16b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v2.8h, v3.16b, #0 +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: ushll v18.2d, v16.2s, #3 +; CHECK-NEXT: ushll v19.2d, v17.2s, #0 +; CHECK-NEXT: ushll2 v4.8h, v5.16b, #0 +; CHECK-NEXT: ushll v20.8h, v5.8b, #0 +; CHECK-NEXT: sub v5.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll2 v18.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v19.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v23.4s, v2.8h, #0 +; CHECK-NEXT: ushll2 v24.4s, v3.8h, #0 +; CHECK-NEXT: ushll v21.2d, v18.2s, #3 +; CHECK-NEXT: ushll v22.2d, v19.2s, #0 +; CHECK-NEXT: ushll v25.2d, v23.2s, #3 +; CHECK-NEXT: ushll v26.2d, v24.2s, #0 +; CHECK-NEXT: sub v21.2d, v22.2d, v21.2d +; CHECK-NEXT: sub v22.2d, v26.2d, v25.2d +; CHECK-NEXT: ushll2 v25.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v26.4s, v20.8h, #0 +; CHECK-NEXT: ushll2 v7.4s, v7.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v6.8h, #0 +; CHECK-NEXT: ushll v27.2d, v25.2s, #3 +; CHECK-NEXT: ushll v28.2d, v26.2s, #0 +; CHECK-NEXT: ushll v29.2d, v7.2s, #3 +; CHECK-NEXT: ushll v30.2d, v6.2s, #0 +; CHECK-NEXT: ushll2 v25.2d, v25.4s, #3 +; CHECK-NEXT: ushll2 v26.2d, v26.4s, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ushll2 v7.2d, v7.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v6.4s, #0 +; 
CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: sub v25.2d, v26.2d, v25.2d +; CHECK-NEXT: ushll v26.2d, v2.2s, #3 +; CHECK-NEXT: sub v6.2d, v6.2d, v7.2d +; CHECK-NEXT: ushll v7.2d, v3.2s, #0 +; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v3.4s, #0 +; CHECK-NEXT: sub v27.2d, v28.2d, v27.2d +; CHECK-NEXT: sub v28.2d, v30.2d, v29.2d +; CHECK-NEXT: ushll2 v23.2d, v23.4s, #3 +; CHECK-NEXT: stp q27, q25, [x8, #160] +; CHECK-NEXT: ushll2 v24.2d, v24.4s, #0 +; CHECK-NEXT: stp q28, q6, [x8, #224] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: ushll v20.4s, v20.4h, #0 +; CHECK-NEXT: ushll2 v18.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v19.4s, #0 +; CHECK-NEXT: sub v2.2d, v3.2d, v2.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v26.2d +; CHECK-NEXT: sub v23.2d, v24.2d, v23.2d +; CHECK-NEXT: ushll v24.2d, v0.2s, #3 +; CHECK-NEXT: stp q3, q2, [x8, #64] +; CHECK-NEXT: ushll v6.2d, v4.2s, #3 +; CHECK-NEXT: stp q22, q23, [x8, #96] +; CHECK-NEXT: ushll v28.2d, v20.2s, #0 +; CHECK-NEXT: ushll2 v4.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v16.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v17.2d, v17.4s, #0 +; CHECK-NEXT: sub v18.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll2 v19.2d, v20.4s, #0 +; CHECK-NEXT: ushll2 v0.2d, v0.4s, #3 +; CHECK-NEXT: stp q21, q18, [x8, #32] +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #0 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: sub v16.2d, v17.2d, v16.2d +; CHECK-NEXT: sub v4.2d, v19.2d, v4.2d +; CHECK-NEXT: sub v6.2d, v28.2d, v6.2d +; CHECK-NEXT: stp q5, q16, [x8, #192] +; CHECK-NEXT: sub v0.2d, v2.2d, v0.2d +; CHECK-NEXT: sub v1.2d, v1.2d, v24.2d +; CHECK-NEXT: stp q6, q4, [x8, #128] ; CHECK-NEXT: stp q1, q0, [x8] -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d14, [sp], #64 // 8-byte Folded Reload ; CHECK-NEXT: ret 
%lp1 = load <8 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -5483,22 +5116,22 @@ define <16 x i32> @std_bv4_v4i16_v16i32(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: std_bv4_v4i16_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d5, [x3] -; CHECK-NEXT: ldp d1, d2, [x1] -; CHECK-NEXT: ldp d3, d4, [x2] -; CHECK-NEXT: ushll v6.4s, v0.4h, #0 -; CHECK-NEXT: ldp d0, d7, [x0] -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v5.4s, v5.4h, #3 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q1, [x2] +; CHECK-NEXT: ldr q3, [x3] +; CHECK-NEXT: ushll2 v4.4s, v0.8h, #3 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v4.4s, v4.4h, #3 -; CHECK-NEXT: ushll v7.4s, v7.4h, #3 -; CHECK-NEXT: ushll v2.4s, v2.4h, #3 -; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v2.4s -; CHECK-NEXT: sub v2.4s, v3.4s, v4.4s -; CHECK-NEXT: sub v3.4s, v6.4s, v5.4s +; CHECK-NEXT: ushll2 v5.4s, v2.8h, #3 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #3 +; CHECK-NEXT: ushll v6.4s, v1.4h, #0 +; CHECK-NEXT: sub v1.4s, v2.4s, v5.4s +; CHECK-NEXT: ushll2 v5.4s, v3.8h, #3 +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: sub v2.4s, v6.4s, v4.4s +; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s ; CHECK-NEXT: ret %lp1 = load <4 x i16>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -5532,34 +5165,34 @@ define <32 x i32> @std_bv4_v8i16_v32i32(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: std_bv4_v8i16_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q5, [x3] -; CHECK-NEXT: ushll v6.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ldp q1, q2, [x1] -; CHECK-NEXT: ushll v21.4s, v5.4h, #3 -; CHECK-NEXT: sub v6.4s, v6.4s, v21.4s -; CHECK-NEXT: ushll v17.4s, v1.4h, #0 -; CHECK-NEXT: ldp q3, q4, [x2] -; CHECK-NEXT: ushll v23.4s, v2.4h, #3 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: ushll v16.4s, v3.4h, #0 -; 
CHECK-NEXT: ushll2 v20.4s, v3.8h, #0 -; CHECK-NEXT: ldp q7, q18, [x0] -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-NEXT: ushll v22.4s, v4.4h, #3 -; CHECK-NEXT: ushll2 v4.4s, v4.8h, #3 -; CHECK-NEXT: ushll v19.4s, v7.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v7.8h, #0 -; CHECK-NEXT: ushll v24.4s, v18.4h, #3 -; CHECK-NEXT: ushll2 v7.4s, v5.8h, #3 -; CHECK-NEXT: ushll2 v5.4s, v18.8h, #3 -; CHECK-NEXT: sub v3.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s -; CHECK-NEXT: sub v5.4s, v20.4s, v4.4s -; CHECK-NEXT: sub v7.4s, v0.4s, v7.4s -; CHECK-NEXT: sub v0.4s, v19.4s, v24.4s -; CHECK-NEXT: sub v2.4s, v17.4s, v23.4s -; CHECK-NEXT: sub v4.4s, v16.4s, v22.4s +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ldp q6, q16, [x1] +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #3 +; CHECK-NEXT: ushll v2.4s, v2.4h, #3 +; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: ushll2 v7.4s, v6.8h, #0 +; CHECK-NEXT: ushll v6.4s, v6.4h, #0 +; CHECK-NEXT: ldp q4, q17, [x2] +; CHECK-NEXT: ushll2 v5.4s, v16.8h, #3 +; CHECK-NEXT: ushll v16.4s, v16.4h, #3 +; CHECK-NEXT: sub v3.4s, v7.4s, v5.4s +; CHECK-NEXT: ushll2 v7.4s, v4.8h, #0 +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldp q18, q19, [x3] +; CHECK-NEXT: ushll2 v5.4s, v17.8h, #3 +; CHECK-NEXT: sub v2.4s, v6.4s, v16.4s +; CHECK-NEXT: ushll v6.4s, v17.4h, #3 +; CHECK-NEXT: ushll2 v21.4s, v18.8h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: ushll2 v20.4s, v19.8h, #3 +; CHECK-NEXT: ushll v16.4s, v19.4h, #3 +; CHECK-NEXT: ushll v17.4s, v18.4h, #0 +; CHECK-NEXT: sub v5.4s, v7.4s, v5.4s +; CHECK-NEXT: sub v7.4s, v21.4s, v20.4s +; CHECK-NEXT: sub v4.4s, v4.4s, v6.4s +; CHECK-NEXT: sub v6.4s, v17.4s, v16.4s ; CHECK-NEXT: ret %lp1 = load <8 x i16>, ptr %p %p2 = getelementptr i8, ptr %p, i32 16 @@ -5593,54 +5226,26 @@ define <8 x i64> @std_bv4_v2i16_v8i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: std_bv4_v2i16_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add 
x8, x0, #4 -; CHECK-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-NEXT: add x9, x0, #2 -; CHECK-NEXT: ld1 { v3.h }[0], [x2] -; CHECK-NEXT: add x10, x0, #6 -; CHECK-NEXT: ld1 { v2.h }[0], [x8] -; CHECK-NEXT: add x8, x1, #2 -; CHECK-NEXT: ld1 { v4.h }[0], [x3] -; CHECK-NEXT: ld1 { v0.h }[2], [x9] -; CHECK-NEXT: add x9, x2, #2 -; CHECK-NEXT: ld1 { v1.h }[2], [x8] -; CHECK-NEXT: add x8, x3, #2 -; CHECK-NEXT: ld1 { v2.h }[2], [x10] -; CHECK-NEXT: add x10, x1, #4 -; CHECK-NEXT: ld1 { v3.h }[2], [x9] -; CHECK-NEXT: add x9, x2, #4 -; CHECK-NEXT: ld1 { v4.h }[2], [x8] -; CHECK-NEXT: add x8, x3, #4 -; CHECK-NEXT: ld1 { v5.h }[0], [x10] -; CHECK-NEXT: add x10, x1, #6 -; CHECK-NEXT: ld1 { v6.h }[0], [x9] -; CHECK-NEXT: add x9, x2, #6 -; CHECK-NEXT: ld1 { v7.h }[0], [x8] -; CHECK-NEXT: add x8, x3, #6 -; CHECK-NEXT: uzp1 v3.4h, v3.4h, v4.4h -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ld1 { v5.h }[2], [x10] -; CHECK-NEXT: ld1 { v6.h }[2], [x9] -; CHECK-NEXT: ld1 { v7.h }[2], [x8] -; CHECK-NEXT: ushll v1.4s, v3.4h, #0 -; CHECK-NEXT: uzp1 v2.4h, v2.4h, v5.4h -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: uzp1 v3.4h, v6.4h, v7.4h -; CHECK-NEXT: ushll v4.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v1.4s, #0 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d0, [x2] +; CHECK-NEXT: ldr d3, [x3] +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v1.4s, v3.4h, #0 -; CHECK-NEXT: ushll v16.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v6.2d, v1.2s, #3 -; CHECK-NEXT: ushll v7.2d, v2.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 -; CHECK-NEXT: sub v1.2d, v0.2d, v2.2d -; CHECK-NEXT: sub v3.2d, v5.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v16.2d, v7.2d -; CHECK-NEXT: sub v2.2d, v4.2d, v6.2d +; CHECK-NEXT: ushll v4.4s, v0.4h, #0 +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: ushll2 v0.2d, v1.4s, #3 +; CHECK-NEXT: 
ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: ushll2 v5.2d, v2.4s, #3 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: sub v0.2d, v1.2d, v0.2d +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #3 +; CHECK-NEXT: sub v1.2d, v2.2d, v5.2d +; CHECK-NEXT: ushll v2.2d, v4.2s, #0 +; CHECK-NEXT: ushll2 v4.2d, v3.4s, #3 +; CHECK-NEXT: ushll v3.2d, v3.2s, #0 +; CHECK-NEXT: sub v2.2d, v2.2d, v6.2d +; CHECK-NEXT: sub v3.2d, v3.2d, v4.2d ; CHECK-NEXT: ret %lp1 = load <2 x i16>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -5674,42 +5279,42 @@ define <16 x i64> @std_bv4_v4i16_v16i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: std_bv4_v4i16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d1, d2, [x1] -; CHECK-NEXT: ldp d3, d4, [x2] -; CHECK-NEXT: ldp d6, d7, [x0] -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ldp d0, d5, [x3] -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v6.4s, v6.4h, #0 -; CHECK-NEXT: ushll v17.2d, v3.2s, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v18.2d, v1.2s, #0 -; CHECK-NEXT: ushll v19.2d, v6.2s, #0 -; CHECK-NEXT: ushll2 v20.2d, v3.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, v1.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v6.4s, #0 -; CHECK-NEXT: ushll v6.4s, v7.4h, #0 +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0 +; CHECK-NEXT: ldr q4, [x3] +; CHECK-NEXT: ushll v16.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v17.4s, v2.8h, #0 ; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v5.4s, v5.4h, #0 +; CHECK-NEXT: ushll2 v18.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v1.2d, v6.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v16.4s, #0 +; CHECK-NEXT: ushll2 v5.2d, v17.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v2.4s, #0 +; CHECK-NEXT: ushll v19.4s, v0.4h, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll2 v0.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v5.2d, v19.4s, #0 +; CHECK-NEXT: ushll2 v20.4s, v4.8h, #0 ; CHECK-NEXT: ushll v4.4s, v4.4h, #0 -; CHECK-NEXT: 
ushll v16.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v21.2d, v5.2s, #3 -; CHECK-NEXT: ushll v22.2d, v4.2s, #3 -; CHECK-NEXT: ushll v23.2d, v2.2s, #3 -; CHECK-NEXT: ushll v24.2d, v6.2s, #3 -; CHECK-NEXT: ushll2 v7.2d, v5.4s, #3 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #3 -; CHECK-NEXT: ushll2 v5.2d, v6.4s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: sub v1.2d, v1.2d, v5.2d -; CHECK-NEXT: sub v3.2d, v3.2d, v2.2d -; CHECK-NEXT: sub v5.2d, v20.2d, v4.2d -; CHECK-NEXT: sub v7.2d, v0.2d, v7.2d -; CHECK-NEXT: sub v0.2d, v19.2d, v24.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v23.2d -; CHECK-NEXT: sub v4.2d, v17.2d, v22.2d -; CHECK-NEXT: sub v6.2d, v16.2d, v21.2d +; CHECK-NEXT: sub v5.2d, v5.2d, v0.2d +; CHECK-NEXT: ushll v0.2d, v6.2s, #3 +; CHECK-NEXT: ushll v6.2d, v16.2s, #0 +; CHECK-NEXT: ushll v16.2d, v17.2s, #3 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: ushll2 v7.2d, v20.4s, #3 +; CHECK-NEXT: ushll2 v21.2d, v4.4s, #0 +; CHECK-NEXT: sub v0.2d, v6.2d, v0.2d +; CHECK-NEXT: sub v2.2d, v2.2d, v16.2d +; CHECK-NEXT: ushll v6.2d, v18.2s, #3 +; CHECK-NEXT: ushll v16.2d, v19.2s, #0 +; CHECK-NEXT: ushll v17.2d, v20.2s, #3 +; CHECK-NEXT: ushll v18.2d, v4.2s, #0 +; CHECK-NEXT: sub v7.2d, v21.2d, v7.2d +; CHECK-NEXT: sub v4.2d, v16.2d, v6.2d +; CHECK-NEXT: sub v6.2d, v18.2d, v17.2d ; CHECK-NEXT: ret %lp1 = load <4 x i16>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -5737,104 +5342,88 @@ %e2 = zext <16 x i16> %l2 to <16 x i64> %se2 = shl <16 x i64> %e2, %a = sub <16 x i64> %e1, %se2 - ret <16 x i64> %a -} - -define <32 x i64> @std_bv4_v8i16_v32i64(ptr %p, ptr %q, ptr %r, ptr %s) { -; CHECK-LABEL: std_bv4_v8i16_v32i64: -; CHECK: // %bb.0: -; CHECK-NEXT: str d14, [sp, #-64]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_offset b8, -8 -; CHECK-NEXT: .cfi_offset b9, -16 -; CHECK-NEXT: .cfi_offset b10, -24 -; CHECK-NEXT: .cfi_offset b11, -32 -; CHECK-NEXT: .cfi_offset b12, -40 -; CHECK-NEXT: .cfi_offset b13, -48 -; CHECK-NEXT: .cfi_offset b14, -64 -; CHECK-NEXT: ldp q2, q27, [x3] -; CHECK-NEXT: ushll2 v16.4s, v2.8h, #0 -; CHECK-NEXT: ushll v21.4s, v2.4h, #0 -; CHECK-NEXT: ldp q4, q26, [x2] -; CHECK-NEXT: ushll v31.4s, v27.4h, #0 -; CHECK-NEXT: ushll2 v27.4s, v27.8h, #0 -; CHECK-NEXT: ushll2 v5.2d, v16.4s, #0 -; CHECK-NEXT: ushll2 v18.4s, v4.8h, #0 -; CHECK-NEXT: ushll v17.2d, v16.2s, #0 -; CHECK-NEXT: ldp q7, q25, [x0] -; CHECK-NEXT: ushll v30.4s, v26.4h, #0 -; CHECK-NEXT: ushll2 v26.4s, v26.8h, #0 -; CHECK-NEXT: ushll2 v8.2d, v27.4s, #3 -; CHECK-NEXT: ushll2 v20.4s, v7.8h, #0 -; CHECK-NEXT: ushll v27.2d, v27.2s, #3 -; CHECK-NEXT: ldp q6, q28, [x1] -; CHECK-NEXT: ushll2 v3.2d, v18.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v20.4s, #0 -; CHECK-NEXT: ushll v24.4s, v4.4h, #0 -; CHECK-NEXT: ushll v29.4s, v6.4h, #0 -; CHECK-NEXT: ushll2 v19.4s, v6.8h, #0 -; CHECK-NEXT: ushll v18.2d, v18.2s, #0 -; CHECK-NEXT: ushll2 v6.2d, v29.4s, #0 -; CHECK-NEXT: ushll v22.2d, v20.2s, #0 -; CHECK-NEXT: ushll v20.2d, v29.2s, #0 -; CHECK-NEXT: ushll v29.4s, v28.4h, #0 -; CHECK-NEXT: ushll2 v28.4s, v28.8h, #0 -; CHECK-NEXT: ushll2 v9.2d, v26.4s, #3 -; CHECK-NEXT: ushll v26.2d, v26.2s, #3 -; CHECK-NEXT: sub v5.2d, v5.2d, v8.2d -; CHECK-NEXT: sub v17.2d, v17.2d, v27.2d -; CHECK-NEXT: ushll2 v1.2d, v19.4s, #0 -; CHECK-NEXT: ushll2 v2.2d, v21.4s, #0 -; CHECK-NEXT: stp q17, q5, [x8, #224] -; CHECK-NEXT: ushll2 v4.2d, v24.4s, #0 -; CHECK-NEXT: ushll v19.2d, v19.2s, #0 + ret <16 x i64> %a +} + +define <32 x i64> @std_bv4_v8i16_v32i64(ptr %p, ptr %q, ptr %r, 
ptr %s) { +; CHECK-LABEL: std_bv4_v8i16_v32i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q4, q5, [x3] +; CHECK-NEXT: ushll v16.4s, v4.4h, #0 +; CHECK-NEXT: ushll2 v4.4s, v4.8h, #0 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ushll v6.4s, v5.4h, #0 +; CHECK-NEXT: ushll v20.2d, v16.2s, #0 +; CHECK-NEXT: ushll v19.2d, v6.2s, #3 +; CHECK-NEXT: ushll2 v21.4s, v0.8h, #0 +; CHECK-NEXT: sub v19.2d, v20.2d, v19.2d +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ushll2 v18.4s, v1.8h, #0 ; CHECK-NEXT: ushll v23.2d, v21.2s, #0 -; CHECK-NEXT: ushll v21.2d, v24.2s, #0 -; CHECK-NEXT: ushll v24.4s, v25.4h, #0 -; CHECK-NEXT: ushll2 v25.4s, v25.8h, #0 -; CHECK-NEXT: ushll2 v10.2d, v28.4s, #3 -; CHECK-NEXT: ushll v28.2d, v28.2s, #3 -; CHECK-NEXT: sub v3.2d, v3.2d, v9.2d -; CHECK-NEXT: sub v5.2d, v18.2d, v26.2d -; CHECK-NEXT: ushll2 v11.2d, v25.4s, #3 -; CHECK-NEXT: ushll v25.2d, v25.2s, #3 -; CHECK-NEXT: stp q5, q3, [x8, #160] -; CHECK-NEXT: sub v1.2d, v1.2d, v10.2d -; CHECK-NEXT: sub v3.2d, v19.2d, v28.2d -; CHECK-NEXT: ushll2 v12.2d, v31.4s, #3 -; CHECK-NEXT: ushll v31.2d, v31.2s, #3 -; CHECK-NEXT: stp q3, q1, [x8, #96] -; CHECK-NEXT: sub v0.2d, v0.2d, v11.2d -; CHECK-NEXT: sub v1.2d, v22.2d, v25.2d -; CHECK-NEXT: ushll2 v13.2d, v30.4s, #3 -; CHECK-NEXT: ushll v30.2d, v30.2s, #3 -; CHECK-NEXT: stp q1, q0, [x8, #32] -; CHECK-NEXT: sub v2.2d, v2.2d, v12.2d -; CHECK-NEXT: sub v0.2d, v23.2d, v31.2d -; CHECK-NEXT: ushll2 v14.2d, v29.4s, #3 -; CHECK-NEXT: ushll v29.2d, v29.2s, #3 -; CHECK-NEXT: stp q0, q2, [x8, #192] -; CHECK-NEXT: sub v1.2d, v4.2d, v13.2d -; CHECK-NEXT: sub v2.2d, v21.2d, v30.2d -; CHECK-NEXT: ushll v16.4s, v7.4h, #0 -; CHECK-NEXT: sub v0.2d, v6.2d, v14.2d -; CHECK-NEXT: stp q2, q1, [x8, #128] -; CHECK-NEXT: sub v1.2d, v20.2d, v29.2d -; CHECK-NEXT: ushll2 v7.2d, v16.4s, #0 -; CHECK-NEXT: ushll v2.2d, v16.2s, #0 -; CHECK-NEXT: stp q1, q0, [x8, #64] -; CHECK-NEXT: ushll2 v0.2d, v24.4s, #3 -; CHECK-NEXT: ushll v1.2d, v24.2s, #3 -; CHECK-NEXT: sub v0.2d, v7.2d, v0.2d +; 
CHECK-NEXT: ushll v22.2d, v18.2s, #3 +; CHECK-NEXT: ushll2 v24.4s, v2.8h, #0 +; CHECK-NEXT: sub v22.2d, v23.2d, v22.2d +; CHECK-NEXT: ldp q7, q17, [x2] +; CHECK-NEXT: ushll2 v20.4s, v3.8h, #0 +; CHECK-NEXT: ushll v26.2d, v24.2s, #0 +; CHECK-NEXT: ushll v25.2d, v20.2s, #3 +; CHECK-NEXT: ushll2 v5.4s, v5.8h, #0 +; CHECK-NEXT: sub v23.2d, v26.2d, v25.2d +; CHECK-NEXT: ushll2 v25.4s, v17.8h, #0 +; CHECK-NEXT: ushll2 v26.4s, v7.8h, #0 +; CHECK-NEXT: ushll v27.2d, v25.2s, #3 +; CHECK-NEXT: ushll v28.2d, v26.2s, #0 +; CHECK-NEXT: ushll v29.2d, v5.2s, #3 +; CHECK-NEXT: ushll v30.2d, v4.2s, #0 +; CHECK-NEXT: ushll2 v25.2d, v25.4s, #3 +; CHECK-NEXT: ushll2 v26.2d, v26.4s, #0 +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: ushll2 v5.2d, v5.4s, #3 +; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: sub v25.2d, v26.2d, v25.2d +; CHECK-NEXT: ushll v26.2d, v3.2s, #3 +; CHECK-NEXT: sub v4.2d, v4.2d, v5.2d +; CHECK-NEXT: ushll v5.2d, v2.2s, #0 +; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 +; CHECK-NEXT: sub v27.2d, v28.2d, v27.2d +; CHECK-NEXT: sub v28.2d, v30.2d, v29.2d +; CHECK-NEXT: ushll2 v20.2d, v20.4s, #3 +; CHECK-NEXT: stp q27, q25, [x8, #160] +; CHECK-NEXT: ushll2 v24.2d, v24.4s, #0 +; CHECK-NEXT: stp q28, q4, [x8, #224] +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v17.4s, v17.4h, #0 +; CHECK-NEXT: ushll v7.4s, v7.4h, #0 +; CHECK-NEXT: sub v2.2d, v2.2d, v3.2d +; CHECK-NEXT: sub v3.2d, v5.2d, v26.2d +; CHECK-NEXT: sub v20.2d, v24.2d, v20.2d +; CHECK-NEXT: ushll v24.2d, v1.2s, #3 +; CHECK-NEXT: stp q3, q2, [x8, #64] +; CHECK-NEXT: ushll v4.2d, v17.2s, #3 +; CHECK-NEXT: stp q23, q20, [x8, #96] +; CHECK-NEXT: ushll v28.2d, v7.2s, #0 +; CHECK-NEXT: ushll2 v17.2d, v17.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v6.4s, #3 +; CHECK-NEXT: ushll2 v18.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v21.2d, v21.4s, #0 +; CHECK-NEXT: ushll2 v16.2d, v16.4s, #0 +; CHECK-NEXT: 
ushll2 v7.2d, v7.4s, #0 +; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sub v18.2d, v21.2d, v18.2d +; CHECK-NEXT: sub v6.2d, v16.2d, v6.2d +; CHECK-NEXT: sub v7.2d, v7.2d, v17.2d +; CHECK-NEXT: stp q22, q18, [x8, #32] +; CHECK-NEXT: sub v4.2d, v28.2d, v4.2d +; CHECK-NEXT: stp q19, q6, [x8, #192] ; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d -; CHECK-NEXT: stp q1, q0, [x8] -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d14, [sp], #64 // 8-byte Folded Reload +; CHECK-NEXT: sub v0.2d, v0.2d, v24.2d +; CHECK-NEXT: stp q4, q7, [x8, #128] +; CHECK-NEXT: stp q0, q1, [x8] ; CHECK-NEXT: ret %lp1 = load <8 x i16>, ptr %p %p2 = getelementptr i8, ptr %p, i32 16 @@ -5868,22 +5457,22 @@ define <8 x i64> @std_bv4_v2i32_v8i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: std_bv4_v2i32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d5, [x3] -; CHECK-NEXT: ldp d1, d2, [x1] -; CHECK-NEXT: ldp d3, d4, [x2] -; CHECK-NEXT: ushll v6.2d, v0.2s, #0 -; CHECK-NEXT: ldp d0, d7, [x0] -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-NEXT: ushll v3.2d, v3.2s, #0 -; CHECK-NEXT: ushll v5.2d, v5.2s, #3 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q1, [x2] +; CHECK-NEXT: ldr q3, [x3] +; CHECK-NEXT: ushll2 v4.2d, v0.4s, #3 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ushll v4.2d, v4.2s, #3 -; CHECK-NEXT: ushll v7.2d, v7.2s, #3 -; CHECK-NEXT: ushll v2.2d, v2.2s, #3 -; CHECK-NEXT: sub v0.2d, v0.2d, v7.2d -; CHECK-NEXT: sub v1.2d, v1.2d, v2.2d -; CHECK-NEXT: sub v2.2d, v3.2d, v4.2d -; CHECK-NEXT: sub v3.2d, v6.2d, v5.2d +; CHECK-NEXT: ushll2 v5.2d, v2.4s, #3 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v4.2d +; CHECK-NEXT: ushll2 v4.2d, v1.4s, #3 +; CHECK-NEXT: ushll v6.2d, v1.2s, #0 +; CHECK-NEXT: sub 
v1.2d, v2.2d, v5.2d +; CHECK-NEXT: ushll2 v5.2d, v3.4s, #3 +; CHECK-NEXT: ushll v3.2d, v3.2s, #0 +; CHECK-NEXT: sub v2.2d, v6.2d, v4.2d +; CHECK-NEXT: sub v3.2d, v3.2d, v5.2d ; CHECK-NEXT: ret %lp1 = load <2 x i32>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -5917,34 +5506,34 @@ define <16 x i64> @std_bv4_v4i32_v16i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: std_bv4_v4i32_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q5, [x3] -; CHECK-NEXT: ushll v6.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ldp q1, q2, [x1] -; CHECK-NEXT: ushll v21.2d, v5.2s, #3 -; CHECK-NEXT: sub v6.2d, v6.2d, v21.2d -; CHECK-NEXT: ushll v17.2d, v1.2s, #0 -; CHECK-NEXT: ldp q3, q4, [x2] -; CHECK-NEXT: ushll v23.2d, v2.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll v16.2d, v3.2s, #0 -; CHECK-NEXT: ushll2 v20.2d, v3.4s, #0 -; CHECK-NEXT: ldp q7, q18, [x0] -; CHECK-NEXT: ushll2 v3.2d, v1.4s, #0 -; CHECK-NEXT: ushll v22.2d, v4.2s, #3 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #3 -; CHECK-NEXT: ushll v19.2d, v7.2s, #0 -; CHECK-NEXT: ushll2 v1.2d, v7.4s, #0 -; CHECK-NEXT: ushll v24.2d, v18.2s, #3 -; CHECK-NEXT: ushll2 v7.2d, v5.4s, #3 -; CHECK-NEXT: ushll2 v5.2d, v18.4s, #3 -; CHECK-NEXT: sub v3.2d, v3.2d, v2.2d -; CHECK-NEXT: sub v1.2d, v1.2d, v5.2d -; CHECK-NEXT: sub v5.2d, v20.2d, v4.2d -; CHECK-NEXT: sub v7.2d, v0.2d, v7.2d -; CHECK-NEXT: sub v0.2d, v19.2d, v24.2d -; CHECK-NEXT: sub v2.2d, v17.2d, v23.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v22.2d +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ldp q6, q16, [x1] +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: ushll2 v7.2d, v6.4s, #0 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: ldp q4, q17, [x2] +; CHECK-NEXT: ushll2 v5.2d, v16.4s, #3 +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll2 
v7.2d, v4.4s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: ldp q18, q19, [x3] +; CHECK-NEXT: ushll2 v5.2d, v17.4s, #3 +; CHECK-NEXT: sub v2.2d, v6.2d, v16.2d +; CHECK-NEXT: ushll v6.2d, v17.2s, #3 +; CHECK-NEXT: ushll2 v21.2d, v18.4s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: ushll2 v20.2d, v19.4s, #3 +; CHECK-NEXT: ushll v16.2d, v19.2s, #3 +; CHECK-NEXT: ushll v17.2d, v18.2s, #0 +; CHECK-NEXT: sub v5.2d, v7.2d, v5.2d +; CHECK-NEXT: sub v7.2d, v21.2d, v20.2d +; CHECK-NEXT: sub v4.2d, v4.2d, v6.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d ; CHECK-NEXT: ret %lp1 = load <4 x i32>, ptr %p %p2 = getelementptr i8, ptr %p, i32 16 @@ -5978,86 +5567,74 @@ define <32 x i64> @std_bv4_v8i32_v32i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: std_bv4_v8i32_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str d14, [sp, #-64]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_offset b8, -8 -; CHECK-NEXT: .cfi_offset b9, -16 -; CHECK-NEXT: .cfi_offset b10, -24 -; CHECK-NEXT: .cfi_offset b11, -32 -; CHECK-NEXT: .cfi_offset b12, -40 -; CHECK-NEXT: .cfi_offset b13, -48 -; CHECK-NEXT: .cfi_offset b14, -64 -; CHECK-NEXT: ldp q16, q18, [x3] -; CHECK-NEXT: ushll2 v23.2d, v16.4s, #0 -; CHECK-NEXT: ushll v16.2d, v16.2s, #0 -; CHECK-NEXT: ldp q31, q30, [x3, #32] -; CHECK-NEXT: ushll2 v21.2d, v18.4s, #0 -; CHECK-NEXT: ushll v18.2d, v18.2s, #0 -; CHECK-NEXT: ushll2 v9.2d, v31.4s, #3 -; CHECK-NEXT: ushll v31.2d, v31.2s, #3 -; CHECK-NEXT: ldp q19, q20, [x2] -; CHECK-NEXT: ushll2 v8.2d, v30.4s, #3 -; CHECK-NEXT: ushll v30.2d, v30.2s, #3 -; CHECK-NEXT: sub v21.2d, v21.2d, v8.2d -; CHECK-NEXT: sub v18.2d, v18.2d, v30.2d -; CHECK-NEXT: ushll2 v6.2d, v19.4s, #0 -; CHECK-NEXT: ldp q29, q28, [x2, #32] -; CHECK-NEXT: ushll2 v7.2d, v20.4s, #0 -; CHECK-NEXT: ushll 
v20.2d, v20.2s, #0 -; CHECK-NEXT: ushll v19.2d, v19.2s, #0 -; CHECK-NEXT: ushll2 v11.2d, v29.4s, #3 -; CHECK-NEXT: ushll v29.2d, v29.2s, #3 -; CHECK-NEXT: ldp q22, q24, [x1] -; CHECK-NEXT: ushll2 v10.2d, v28.4s, #3 -; CHECK-NEXT: ushll v28.2d, v28.2s, #3 -; CHECK-NEXT: sub v7.2d, v7.2d, v10.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v11.2d -; CHECK-NEXT: ushll2 v4.2d, v22.4s, #0 -; CHECK-NEXT: ldp q27, q26, [x1, #32] -; CHECK-NEXT: ushll2 v5.2d, v24.4s, #0 -; CHECK-NEXT: ushll v24.2d, v24.2s, #0 -; CHECK-NEXT: ushll v22.2d, v22.2s, #0 -; CHECK-NEXT: ushll2 v13.2d, v27.4s, #3 -; CHECK-NEXT: ushll v27.2d, v27.2s, #3 -; CHECK-NEXT: ldp q0, q17, [x0] -; CHECK-NEXT: ushll2 v12.2d, v26.4s, #3 -; CHECK-NEXT: ushll v26.2d, v26.2s, #3 -; CHECK-NEXT: sub v5.2d, v5.2d, v12.2d -; CHECK-NEXT: sub v4.2d, v4.2d, v13.2d -; CHECK-NEXT: ushll2 v2.2d, v0.4s, #0 -; CHECK-NEXT: ldp q1, q25, [x0, #32] -; CHECK-NEXT: stp q18, q21, [x8, #224] -; CHECK-NEXT: sub v18.2d, v20.2d, v28.2d -; CHECK-NEXT: ushll2 v3.2d, v17.4s, #0 -; CHECK-NEXT: stp q18, q7, [x8, #160] -; CHECK-NEXT: sub v7.2d, v19.2d, v29.2d -; CHECK-NEXT: ushll v17.2d, v17.2s, #0 -; CHECK-NEXT: stp q7, q6, [x8, #128] -; CHECK-NEXT: sub v6.2d, v24.2d, v26.2d -; CHECK-NEXT: ushll2 v14.2d, v25.4s, #3 -; CHECK-NEXT: ushll v25.2d, v25.2s, #3 -; CHECK-NEXT: stp q6, q5, [x8, #96] -; CHECK-NEXT: sub v5.2d, v22.2d, v27.2d -; CHECK-NEXT: sub v3.2d, v3.2d, v14.2d -; CHECK-NEXT: stp q5, q4, [x8, #64] -; CHECK-NEXT: sub v4.2d, v17.2d, v25.2d -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: stp q4, q3, [x8, #32] -; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 -; CHECK-NEXT: ushll v1.2d, v1.2s, #3 -; CHECK-NEXT: sub v23.2d, v23.2d, v9.2d -; CHECK-NEXT: sub v16.2d, v16.2d, v31.2d -; CHECK-NEXT: sub v2.2d, v2.2d, v3.2d -; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d -; CHECK-NEXT: stp q16, q23, [x8, #192] -; CHECK-NEXT: stp q0, q2, [x8] -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; 
CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d14, [sp], #64 // 8-byte Folded Reload +; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset b8, -16 +; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: ldp q16, q7, [x1, #32] +; CHECK-NEXT: ushll v21.2d, v1.2s, #0 +; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 +; CHECK-NEXT: ldp q17, q4, [x2, #32] +; CHECK-NEXT: ushll v19.2d, v7.2s, #3 +; CHECK-NEXT: ushll2 v7.2d, v7.4s, #3 +; CHECK-NEXT: sub v19.2d, v21.2d, v19.2d +; CHECK-NEXT: ushll v22.2d, v17.2s, #3 +; CHECK-NEXT: sub v1.2d, v1.2d, v7.2d +; CHECK-NEXT: ldp q20, q18, [x2] +; CHECK-NEXT: ushll v26.2d, v4.2s, #3 +; CHECK-NEXT: ushll2 v4.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v17.2d, v17.4s, #3 +; CHECK-NEXT: ushll v24.2d, v20.2s, #0 +; CHECK-NEXT: ushll2 v20.2d, v20.4s, #0 +; CHECK-NEXT: ldp q25, q23, [x3] +; CHECK-NEXT: sub v22.2d, v24.2d, v22.2d +; CHECK-NEXT: ushll v28.2d, v18.2s, #0 +; CHECK-NEXT: ushll2 v18.2d, v18.4s, #0 +; CHECK-NEXT: ushll v31.2d, v25.2s, #0 +; CHECK-NEXT: sub v26.2d, v28.2d, v26.2d +; CHECK-NEXT: ldp q24, q21, [x3, #32] +; CHECK-NEXT: sub v4.2d, v18.2d, v4.2d +; CHECK-NEXT: ushll2 v25.2d, v25.4s, #0 +; CHECK-NEXT: sub v17.2d, v20.2d, v17.2d +; CHECK-NEXT: ushll v30.2d, v24.2s, #3 +; CHECK-NEXT: ushll2 v24.2d, v24.4s, #3 +; CHECK-NEXT: ldp q0, q5, [x0, #32] +; CHECK-NEXT: sub v28.2d, v31.2d, v30.2d +; CHECK-NEXT: ushll v30.2d, v21.2s, #3 +; CHECK-NEXT: ushll v31.2d, v23.2s, #0 +; CHECK-NEXT: ushll2 v21.2d, v21.4s, #3 +; CHECK-NEXT: ushll2 v23.2d, v23.4s, #0 +; CHECK-NEXT: ldp q29, q27, [x0] +; CHECK-NEXT: sub v30.2d, v31.2d, v30.2d +; CHECK-NEXT: stp q19, q1, [x8, #96] +; CHECK-NEXT: ushll v31.2d, v16.2s, #3 +; CHECK-NEXT: stp q22, q17, [x8, #128] +; CHECK-NEXT: sub v21.2d, v23.2d, v21.2d +; CHECK-NEXT: stp q26, q4, [x8, #160] +; CHECK-NEXT: ushll v23.2d, v2.2s, #0 +; CHECK-NEXT: ushll2 v16.2d, v16.4s, #3 +; CHECK-NEXT: stp q30, q21, [x8, #224] +; 
CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 +; CHECK-NEXT: ushll v6.2d, v5.2s, #3 +; CHECK-NEXT: ushll v8.2d, v27.2s, #0 +; CHECK-NEXT: ushll2 v5.2d, v5.4s, #3 +; CHECK-NEXT: ushll2 v4.2d, v27.4s, #0 +; CHECK-NEXT: sub v2.2d, v2.2d, v16.2d +; CHECK-NEXT: sub v1.2d, v23.2d, v31.2d +; CHECK-NEXT: sub v4.2d, v4.2d, v5.2d +; CHECK-NEXT: stp q1, q2, [x8, #64] +; CHECK-NEXT: sub v2.2d, v8.2d, v6.2d +; CHECK-NEXT: ushll v3.2d, v0.2s, #3 +; CHECK-NEXT: ushll2 v0.2d, v0.4s, #3 +; CHECK-NEXT: stp q2, q4, [x8, #32] +; CHECK-NEXT: ushll2 v1.2d, v29.4s, #0 +; CHECK-NEXT: ushll v2.2d, v29.2s, #0 +; CHECK-NEXT: sub v24.2d, v25.2d, v24.2d +; CHECK-NEXT: sub v0.2d, v1.2d, v0.2d +; CHECK-NEXT: sub v1.2d, v2.2d, v3.2d +; CHECK-NEXT: stp q28, q24, [x8, #192] +; CHECK-NEXT: stp q1, q0, [x8] +; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %lp1 = load <8 x i32>, ptr %p %p2 = getelementptr i8, ptr %p, i32 32 @@ -6316,36 +5893,26 @@ define <32 x i16> @dbl_bv4_v8i8_v32i16(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dbl_bv4_v8i8_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d16, d6, [x3, #8] -; CHECK-NEXT: ldr d7, [x3] -; CHECK-NEXT: ldp d0, d4, [x2] -; CHECK-NEXT: add v6.8b, v7.8b, v6.8b -; CHECK-NEXT: ldr d17, [x3, #24] -; CHECK-NEXT: ldp d5, d7, [x2, #16] -; CHECK-NEXT: ushll v6.8h, v6.8b, #0 -; CHECK-NEXT: ldp d18, d3, [x1, #8] -; CHECK-NEXT: add v0.8b, v0.8b, v5.8b -; CHECK-NEXT: ldr d5, [x1] -; CHECK-NEXT: ldp d2, d20, [x0, #16] -; CHECK-NEXT: ushll v19.8h, v0.8b, #0 -; CHECK-NEXT: add v4.8b, v4.8b, v7.8b -; CHECK-NEXT: add v3.8b, v5.8b, v3.8b -; CHECK-NEXT: ldr d0, [x1, #24] -; CHECK-NEXT: ldp d1, d5, [x0] -; CHECK-NEXT: ushll v4.8h, v4.8b, #3 -; CHECK-NEXT: add v0.8b, v18.8b, v0.8b -; CHECK-NEXT: add v1.8b, v1.8b, v2.8b -; CHECK-NEXT: ushll v2.8h, v3.8b, #0 -; CHECK-NEXT: add v3.8b, v5.8b, v20.8b -; CHECK-NEXT: add v5.8b, v16.8b, v17.8b +; CHECK-NEXT: ldp q0, q3, [x3] +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: add v3.16b, v0.16b, v3.16b +; CHECK-NEXT: ldp 
q6, q0, [x1] +; CHECK-NEXT: add v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ushll2 v2.8h, v1.16b, #3 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v5.8h, v5.8b, #3 -; CHECK-NEXT: ushll v3.8h, v3.8b, #3 -; CHECK-NEXT: ushll v7.8h, v0.8b, #3 -; CHECK-NEXT: sub v0.8h, v1.8h, v3.8h -; CHECK-NEXT: sub v1.8h, v2.8h, v7.8h -; CHECK-NEXT: sub v2.8h, v19.8h, v4.8h -; CHECK-NEXT: sub v3.8h, v6.8h, v5.8h +; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: add v0.16b, v6.16b, v0.16b +; CHECK-NEXT: ushll v6.8h, v0.8b, #0 +; CHECK-NEXT: add v4.16b, v4.16b, v5.16b +; CHECK-NEXT: ushll2 v5.8h, v0.16b, #3 +; CHECK-NEXT: sub v0.8h, v1.8h, v2.8h +; CHECK-NEXT: sub v1.8h, v6.8h, v5.8h +; CHECK-NEXT: ushll2 v2.8h, v4.16b, #3 +; CHECK-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-NEXT: ushll2 v5.8h, v3.16b, #3 +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: sub v2.8h, v4.8h, v2.8h +; CHECK-NEXT: sub v3.8h, v3.8h, v5.8h ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -6548,44 +6115,30 @@ define <16 x i32> @dbl_bv4_v4i8_v16i32(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dbl_bv4_v4i8_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: add x8, x1, #8 -; CHECK-NEXT: add x9, x1, #12 -; CHECK-NEXT: add x10, x2, #8 -; CHECK-NEXT: add x11, x2, #12 -; CHECK-NEXT: add x12, x3, #8 -; CHECK-NEXT: add x13, x3, #12 -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 -; CHECK-NEXT: ld1 { v0.s }[2], [x2], #4 -; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4 -; CHECK-NEXT: ldp s2, s3, [x0, #8] -; CHECK-NEXT: ld1 { v1.s }[1], [x1] -; CHECK-NEXT: ld1 { v2.s }[1], [x8] -; CHECK-NEXT: ld1 { v3.s }[1], [x9] -; CHECK-NEXT: ld1 { v1.s }[2], [x2] -; CHECK-NEXT: ld1 { v2.s }[2], [x10] -; CHECK-NEXT: ld1 { v3.s }[2], [x11] -; CHECK-NEXT: ld1 { v1.s }[3], [x3] -; CHECK-NEXT: ld1 { v2.s }[3], [x12] -; CHECK-NEXT: ld1 { v3.s }[3], [x13] -; CHECK-NEXT: add v0.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.16b, v1.16b, v3.16b -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: 
ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ldp d1, d2, [x0] +; CHECK-NEXT: ldp d3, d4, [x1] +; CHECK-NEXT: ldp d0, d5, [x2] +; CHECK-NEXT: add v1.8b, v1.8b, v2.8b +; CHECK-NEXT: ldp d6, d2, [x3] +; CHECK-NEXT: add v3.8b, v3.8b, v4.8b ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v6.4s, v1.4h, #3 -; CHECK-NEXT: ushll v7.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #3 -; CHECK-NEXT: sub v3.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v2.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v4.4s, v6.4s -; CHECK-NEXT: sub v2.4s, v5.4s, v7.4s +; CHECK-NEXT: add v0.8b, v0.8b, v5.8b +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: add v2.8b, v6.8b, v2.8b +; CHECK-NEXT: ushll v4.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v0.4s, v1.8h, #3 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v5.4s, v3.8h, #3 +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: ushll2 v6.4s, v4.8h, #3 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s +; CHECK-NEXT: sub v1.4s, v3.4s, v5.4s +; CHECK-NEXT: ushll v3.4s, v4.4h, #0 +; CHECK-NEXT: ushll2 v4.4s, v2.8h, #3 +; CHECK-NEXT: ushll v5.4s, v2.4h, #0 +; CHECK-NEXT: sub v2.4s, v3.4s, v6.4s +; CHECK-NEXT: sub v3.4s, v5.4s, v4.4s ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -6649,56 +6202,46 @@ define <32 x i32> @dbl_bv4_v8i8_v32i32(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dbl_bv4_v8i8_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d6, d4, [x3, #8] -; CHECK-NEXT: ldr d5, [x3] -; CHECK-NEXT: ldp d19, d16, [x1, #8] -; CHECK-NEXT: ldp d0, d2, [x2] -; CHECK-NEXT: add v4.8b, v5.8b, v4.8b -; CHECK-NEXT: ldp d3, d5, [x2, #16] -; CHECK-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-NEXT: ldr d17, [x1] -; CHECK-NEXT: ldr d7, [x3, #24] -; CHECK-NEXT: add 
v0.8b, v0.8b, v3.8b -; CHECK-NEXT: ldr d21, [x1, #24] -; CHECK-NEXT: ldp d3, d22, [x0, #16] -; CHECK-NEXT: add v16.8b, v17.8b, v16.8b -; CHECK-NEXT: add v6.8b, v6.8b, v7.8b -; CHECK-NEXT: ldp d1, d17, [x0] -; CHECK-NEXT: add v19.8b, v19.8b, v21.8b -; CHECK-NEXT: add v2.8b, v2.8b, v5.8b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v19.8h, v19.8b, #0 -; CHECK-NEXT: add v1.8b, v1.8b, v3.8b -; CHECK-NEXT: add v17.8b, v17.8b, v22.8b -; CHECK-NEXT: ushll v3.8h, v16.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v17.8h, v17.8b, #0 -; CHECK-NEXT: ushll v5.8h, v6.8b, #0 +; CHECK-NEXT: ldp q0, q3, [x3] +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: add v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ushll2 v19.8h, v0.16b, #0 +; CHECK-NEXT: ushll v20.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v0.4s, v19.8h, #3 +; CHECK-NEXT: ldp q6, q3, [x1] +; CHECK-NEXT: add v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ushll2 v21.4s, v20.8h, #0 +; CHECK-NEXT: ushll v16.8h, v1.8b, #0 +; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: add v2.16b, v6.16b, v3.16b +; CHECK-NEXT: ushll2 v6.8h, v1.16b, #0 +; CHECK-NEXT: ushll2 v17.8h, v2.16b, #0 ; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v18.4s, v4.4h, #0 -; CHECK-NEXT: ushll v20.4s, v0.4h, #0 -; CHECK-NEXT: ushll v16.4s, v3.4h, #0 -; CHECK-NEXT: ushll v23.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v4.4s, v4.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v6.4s, v5.4h, #3 -; CHECK-NEXT: ushll v21.4s, v2.4h, #3 -; CHECK-NEXT: ushll v22.4s, v19.4h, #3 -; CHECK-NEXT: ushll v24.4s, v17.4h, #3 -; CHECK-NEXT: ushll2 v7.4s, v5.8h, #3 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 +; CHECK-NEXT: ushll2 v1.4s, v6.8h, #3 +; CHECK-NEXT: add v4.16b, v4.16b, v5.16b +; CHECK-NEXT: ushll2 v3.4s, v16.8h, #0 ; CHECK-NEXT: ushll2 v5.4s, v17.8h, #3 -; CHECK-NEXT: ushll2 v17.4s, v19.8h, #3 -; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v17.4s 
-; CHECK-NEXT: sub v5.4s, v0.4s, v2.4s -; CHECK-NEXT: sub v7.4s, v4.4s, v7.4s -; CHECK-NEXT: sub v0.4s, v23.4s, v24.4s -; CHECK-NEXT: sub v2.4s, v16.4s, v22.4s -; CHECK-NEXT: sub v4.4s, v20.4s, v21.4s -; CHECK-NEXT: sub v6.4s, v18.4s, v6.4s +; CHECK-NEXT: ushll2 v7.4s, v2.8h, #0 +; CHECK-NEXT: ushll2 v18.8h, v4.16b, #0 +; CHECK-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v5.4s +; CHECK-NEXT: ushll2 v5.4s, v18.8h, #3 +; CHECK-NEXT: ushll2 v7.4s, v4.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: sub v5.4s, v7.4s, v5.4s +; CHECK-NEXT: sub v7.4s, v21.4s, v0.4s +; CHECK-NEXT: ushll v0.4s, v6.4h, #3 +; CHECK-NEXT: ushll v6.4s, v16.4h, #0 +; CHECK-NEXT: ushll v16.4s, v17.4h, #3 +; CHECK-NEXT: sub v0.4s, v6.4s, v0.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v16.4s +; CHECK-NEXT: ushll v6.4s, v18.4h, #3 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: ushll v16.4s, v19.4h, #3 +; CHECK-NEXT: ushll v17.4s, v20.4h, #0 +; CHECK-NEXT: sub v4.4s, v4.4s, v6.4s +; CHECK-NEXT: sub v6.4s, v17.4s, v16.4s ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -6725,129 +6268,69 @@ %s2 = getelementptr i8, ptr %s, i32 8 %ls2 = load <8 x i8>, ptr %s2 %s3 = getelementptr i8, ptr %s, i32 16 - %ls3 = load <8 x i8>, ptr %s3 - %s4 = getelementptr i8, ptr %s, i32 24 - %ls4 = load <8 x i8>, ptr %s4 - - %jk1 = shufflevector <8 x i8> %lp1, <8 x i8> %lq1, <32 x i32> - %m1l = shufflevector <8 x i8> %lr1, <8 x i8> poison, <32 x i32> - %jkm1 = shufflevector <32 x i8> %jk1, <32 x i8> %m1l, <32 x i32> - %n1l = shufflevector <8 x i8> %ls1, <8 x i8> poison, <32 x i32> - %l1 = shufflevector <32 x i8> %jkm1, <32 x i8> %n1l, <32 x i32> - %jk2 = shufflevector <8 x i8> %lp2, <8 x i8> %lq2, <32 x i32> - %m2l = shufflevector <8 x i8> %lr2, <8 x i8> poison, <32 x i32> - %jkm2 = shufflevector <32 x i8> %jk2, <32 x i8> %m2l, <32 x i32> - %n2l = shufflevector <8 x i8> %ls2, <8 x i8> poison, <32 x i32> - %l2 = 
shufflevector <32 x i8> %jkm2, <32 x i8> %n2l, <32 x i32> - %jk3 = shufflevector <8 x i8> %lp3, <8 x i8> %lq3, <32 x i32> - %m3l = shufflevector <8 x i8> %lr3, <8 x i8> poison, <32 x i32> - %jkm3 = shufflevector <32 x i8> %jk3, <32 x i8> %m3l, <32 x i32> - %n3l = shufflevector <8 x i8> %ls3, <8 x i8> poison, <32 x i32> - %l3 = shufflevector <32 x i8> %jkm3, <32 x i8> %n3l, <32 x i32> - %jk4 = shufflevector <8 x i8> %lp4, <8 x i8> %lq4, <32 x i32> - %m4l = shufflevector <8 x i8> %lr4, <8 x i8> poison, <32 x i32> - %jkm4 = shufflevector <32 x i8> %jk4, <32 x i8> %m4l, <32 x i32> - %n4l = shufflevector <8 x i8> %ls4, <8 x i8> poison, <32 x i32> - %l4 = shufflevector <32 x i8> %jkm4, <32 x i8> %n4l, <32 x i32> - - %la1 = add <32 x i8> %l1, %l3 - %la2 = add <32 x i8> %l2, %l4 - %e1 = zext <32 x i8> %la1 to <32 x i32> - %e2 = zext <32 x i8> %la2 to <32 x i32> - %se2 = shl <32 x i32> %e2, - %a = sub <32 x i32> %e1, %se2 - ret <32 x i32> %a -} - -define <8 x i64> @dbl_bv4_v2i8_v8i64(ptr %p, ptr %q, ptr %r, ptr %s) { -; CHECK-LABEL: dbl_bv4_v2i8_v8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: add x9, x0, #4 -; CHECK-NEXT: ld1 { v20.b }[0], [x0] -; CHECK-NEXT: add x10, x0, #6 -; CHECK-NEXT: add x11, x3, #3 -; CHECK-NEXT: ld1 { v4.b }[0], [x8] -; CHECK-NEXT: add x8, x0, #1 -; CHECK-NEXT: ld1 { v16.b }[0], [x9] -; CHECK-NEXT: add x9, x0, #3 -; CHECK-NEXT: ld1 { v0.b }[0], [x10] -; CHECK-NEXT: add x10, x0, #7 -; CHECK-NEXT: ld1 { v20.b }[4], [x8] -; CHECK-NEXT: add x8, x0, #5 -; CHECK-NEXT: ld1 { v4.b }[4], [x9] -; CHECK-NEXT: add x9, x1, #2 -; CHECK-NEXT: ld1 { v21.b }[0], [x1] -; CHECK-NEXT: ld1 { v16.b }[4], [x8] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: ld1 { v0.b }[4], [x10] -; CHECK-NEXT: add x10, x1, #4 -; CHECK-NEXT: ld1 { v5.b }[0], [x9] -; CHECK-NEXT: add x9, x1, #6 -; CHECK-NEXT: ld1 { v21.b }[4], [x8] -; CHECK-NEXT: add x8, x1, #3 -; CHECK-NEXT: ld1 { v17.b }[0], [x10] -; CHECK-NEXT: add x10, x2, #2 -; CHECK-NEXT: ld1 { v1.b }[0], [x9] -; 
CHECK-NEXT: add x9, x1, #5 -; CHECK-NEXT: ld1 { v5.b }[4], [x8] -; CHECK-NEXT: add x8, x1, #7 -; CHECK-NEXT: ld1 { v22.b }[0], [x2] -; CHECK-NEXT: ld1 { v17.b }[4], [x9] -; CHECK-NEXT: add x9, x2, #1 -; CHECK-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-NEXT: add x8, x2, #4 -; CHECK-NEXT: ld1 { v6.b }[0], [x10] -; CHECK-NEXT: add x10, x2, #6 -; CHECK-NEXT: ld1 { v22.b }[4], [x9] -; CHECK-NEXT: add x9, x2, #3 -; CHECK-NEXT: ld1 { v18.b }[0], [x8] -; CHECK-NEXT: add x8, x2, #5 -; CHECK-NEXT: ld1 { v2.b }[0], [x10] -; CHECK-NEXT: add x10, x3, #4 -; CHECK-NEXT: ld1 { v6.b }[4], [x9] -; CHECK-NEXT: add x9, x2, #7 -; CHECK-NEXT: ld1 { v23.b }[0], [x3] -; CHECK-NEXT: ld1 { v18.b }[4], [x8] -; CHECK-NEXT: add x8, x3, #1 -; CHECK-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-NEXT: add x9, x3, #2 -; CHECK-NEXT: ld1 { v19.b }[0], [x10] -; CHECK-NEXT: add x10, x3, #6 -; CHECK-NEXT: ld1 { v23.b }[4], [x8] -; CHECK-NEXT: add x8, x3, #5 -; CHECK-NEXT: ld1 { v7.b }[0], [x9] -; CHECK-NEXT: adrp x9, .LCPI136_0 -; CHECK-NEXT: ld1 { v3.b }[0], [x10] -; CHECK-NEXT: ld1 { v19.b }[4], [x8] -; CHECK-NEXT: add x8, x3, #7 -; CHECK-NEXT: ldr d24, [x9, :lo12:.LCPI136_0] -; CHECK-NEXT: ld1 { v7.b }[4], [x11] -; CHECK-NEXT: ld1 { v3.b }[4], [x8] -; CHECK-NEXT: tbl v20.8b, { v20.16b, v21.16b, v22.16b, v23.16b }, v24.8b -; CHECK-NEXT: tbl v16.8b, { v16.16b, v17.16b, v18.16b, v19.16b }, v24.8b -; CHECK-NEXT: tbl v4.8b, { v4.16b, v5.16b, v6.16b, v7.16b }, v24.8b -; CHECK-NEXT: tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v24.8b -; CHECK-NEXT: add v1.8b, v20.8b, v16.8b -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: add v0.8b, v4.8b, v0.8b -; CHECK-NEXT: ushll v2.4s, v1.4h, #0 + %ls3 = load <8 x i8>, ptr %s3 + %s4 = getelementptr i8, ptr %s, i32 24 + %ls4 = load <8 x i8>, ptr %s4 + + %jk1 = shufflevector <8 x i8> %lp1, <8 x i8> %lq1, <32 x i32> + %m1l = shufflevector <8 x i8> %lr1, <8 x i8> poison, <32 x i32> + %jkm1 = shufflevector <32 x i8> %jk1, <32 x i8> %m1l, <32 x i32> + %n1l = shufflevector <8 x i8> 
%ls1, <8 x i8> poison, <32 x i32> + %l1 = shufflevector <32 x i8> %jkm1, <32 x i8> %n1l, <32 x i32> + %jk2 = shufflevector <8 x i8> %lp2, <8 x i8> %lq2, <32 x i32> + %m2l = shufflevector <8 x i8> %lr2, <8 x i8> poison, <32 x i32> + %jkm2 = shufflevector <32 x i8> %jk2, <32 x i8> %m2l, <32 x i32> + %n2l = shufflevector <8 x i8> %ls2, <8 x i8> poison, <32 x i32> + %l2 = shufflevector <32 x i8> %jkm2, <32 x i8> %n2l, <32 x i32> + %jk3 = shufflevector <8 x i8> %lp3, <8 x i8> %lq3, <32 x i32> + %m3l = shufflevector <8 x i8> %lr3, <8 x i8> poison, <32 x i32> + %jkm3 = shufflevector <32 x i8> %jk3, <32 x i8> %m3l, <32 x i32> + %n3l = shufflevector <8 x i8> %ls3, <8 x i8> poison, <32 x i32> + %l3 = shufflevector <32 x i8> %jkm3, <32 x i8> %n3l, <32 x i32> + %jk4 = shufflevector <8 x i8> %lp4, <8 x i8> %lq4, <32 x i32> + %m4l = shufflevector <8 x i8> %lr4, <8 x i8> poison, <32 x i32> + %jkm4 = shufflevector <32 x i8> %jk4, <32 x i8> %m4l, <32 x i32> + %n4l = shufflevector <8 x i8> %ls4, <8 x i8> poison, <32 x i32> + %l4 = shufflevector <32 x i8> %jkm4, <32 x i8> %n4l, <32 x i32> + + %la1 = add <32 x i8> %l1, %l3 + %la2 = add <32 x i8> %l2, %l4 + %e1 = zext <32 x i8> %la1 to <32 x i32> + %e2 = zext <32 x i8> %la2 to <32 x i32> + %se2 = shl <32 x i32> %e2, + %a = sub <32 x i32> %e1, %se2 + ret <32 x i32> %a +} + +define <8 x i64> @dbl_bv4_v2i8_v8i64(ptr %p, ptr %q, ptr %r, ptr %s) { +; CHECK-LABEL: dbl_bv4_v2i8_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v0.s }[2], [x2], #4 +; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4 +; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: ld1 { v1.s }[2], [x2] +; CHECK-NEXT: ld1 { v1.s }[3], [x3] +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v3.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; 
CHECK-NEXT: ushll v5.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v6.2d, v3.2s, #3 -; CHECK-NEXT: ushll v7.2d, v0.2s, #3 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #3 -; CHECK-NEXT: ushll2 v16.2d, v3.4s, #3 -; CHECK-NEXT: sub v3.2d, v1.2d, v0.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v16.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: ushll v4.4s, v1.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: ushll2 v5.2d, v3.4s, #3 +; CHECK-NEXT: ushll v6.2d, v3.2s, #0 +; CHECK-NEXT: sub v3.2d, v2.2d, v1.2d +; CHECK-NEXT: ushll2 v2.2d, v4.4s, #3 +; CHECK-NEXT: sub v1.2d, v6.2d, v5.2d +; CHECK-NEXT: ushll2 v5.2d, v0.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v5.2d +; CHECK-NEXT: sub v2.2d, v4.2d, v2.2d ; CHECK-NEXT: ret %lp1 = load <2 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 2 @@ -6911,64 +6394,50 @@ define <16 x i64> @dbl_bv4_v4i8_v16i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dbl_bv4_v4i8_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: add x8, x1, #8 -; CHECK-NEXT: add x9, x1, #12 -; CHECK-NEXT: add x10, x2, #8 -; CHECK-NEXT: add x11, x2, #12 -; CHECK-NEXT: add x12, x3, #8 -; CHECK-NEXT: add x13, x3, #12 -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 -; CHECK-NEXT: ld1 { v0.s }[2], [x2], #4 -; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4 -; CHECK-NEXT: ldp s2, s3, [x0, #8] -; CHECK-NEXT: ld1 { v1.s }[1], [x1] -; CHECK-NEXT: ld1 { v2.s }[1], [x8] -; CHECK-NEXT: ld1 { v3.s }[1], [x9] -; CHECK-NEXT: ld1 { v1.s }[2], [x2] -; CHECK-NEXT: ld1 { v2.s }[2], [x10] -; CHECK-NEXT: ld1 { v3.s }[2], [x11] -; CHECK-NEXT: ld1 { v1.s }[3], [x3] -; CHECK-NEXT: ld1 { v2.s }[3], [x12] -; CHECK-NEXT: ld1 { v3.s }[3], [x13] -; 
CHECK-NEXT: add v0.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.16b, v1.16b, v3.16b -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ldp d1, d2, [x0] +; CHECK-NEXT: ldp d0, d5, [x2] +; CHECK-NEXT: ldp d3, d4, [x1] +; CHECK-NEXT: add v1.8b, v1.8b, v2.8b +; CHECK-NEXT: ldp d6, d2, [x3] +; CHECK-NEXT: add v0.8b, v0.8b, v5.8b ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v6.4s, v3.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v1.8h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v16.2d, v4.2s, #0 -; CHECK-NEXT: ushll v17.2d, v5.2s, #0 -; CHECK-NEXT: ushll v18.2d, v2.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll v19.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v20.2d, v1.2s, #3 -; CHECK-NEXT: ushll v21.2d, v3.2s, #3 -; CHECK-NEXT: ushll v22.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 -; CHECK-NEXT: ushll v23.2d, v6.2s, #3 -; CHECK-NEXT: ushll2 v24.2d, v3.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v6.4s, #3 -; CHECK-NEXT: ushll2 v6.2d, v7.4s, #3 -; CHECK-NEXT: sub v7.2d, v0.2d, v3.2d -; CHECK-NEXT: sub v3.2d, v2.2d, v6.2d -; CHECK-NEXT: sub v5.2d, v5.2d, v24.2d -; CHECK-NEXT: sub v6.2d, v19.2d, v23.2d -; CHECK-NEXT: sub v1.2d, v4.2d, v1.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v16.2d, v20.2d -; CHECK-NEXT: sub v4.2d, v17.2d, v21.2d +; CHECK-NEXT: add v3.8b, v3.8b, v4.8b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: add v2.8b, v6.8b, v2.8b +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-NEXT: ushll v16.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v17.4s, v3.8h, #0 +; CHECK-NEXT: ushll v18.4s, 
v3.4h, #0 +; CHECK-NEXT: ushll2 v19.4s, v0.8h, #0 +; CHECK-NEXT: ushll v6.4s, v0.4h, #0 +; CHECK-NEXT: ushll v0.8h, v2.8b, #0 +; CHECK-NEXT: ushll2 v1.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v16.4s, #0 +; CHECK-NEXT: ushll2 v5.2d, v17.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v18.4s, #0 +; CHECK-NEXT: ushll2 v20.4s, v0.8h, #0 +; CHECK-NEXT: ushll v21.4s, v0.4h, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll2 v5.2d, v19.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v0.2d, v20.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v21.4s, #0 +; CHECK-NEXT: sub v5.2d, v2.2d, v5.2d +; CHECK-NEXT: sub v7.2d, v7.2d, v0.2d +; CHECK-NEXT: ushll v0.2d, v4.2s, #3 +; CHECK-NEXT: ushll v2.2d, v16.2s, #0 +; CHECK-NEXT: ushll v4.2d, v17.2s, #3 +; CHECK-NEXT: ushll v16.2d, v18.2s, #0 +; CHECK-NEXT: sub v0.2d, v2.2d, v0.2d +; CHECK-NEXT: sub v2.2d, v16.2d, v4.2d +; CHECK-NEXT: ushll v4.2d, v19.2s, #3 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: ushll v16.2d, v20.2s, #3 +; CHECK-NEXT: ushll v17.2d, v21.2s, #0 +; CHECK-NEXT: sub v4.2d, v6.2d, v4.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -7032,119 +6501,94 @@ define <32 x i64> @dbl_bv4_v8i8_v32i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dbl_bv4_v8i8_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str d14, [sp, #-64]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_offset b8, -8 -; CHECK-NEXT: .cfi_offset b9, -16 -; CHECK-NEXT: .cfi_offset b10, -24 -; CHECK-NEXT: .cfi_offset b11, -32 -; CHECK-NEXT: .cfi_offset b12, -40 -; CHECK-NEXT: .cfi_offset b13, -48 -; CHECK-NEXT: .cfi_offset b14, -64 -; CHECK-NEXT: ldp d0, d25, [x2] -; CHECK-NEXT: ldp d1, d28, [x2, #16] -; CHECK-NEXT: ldp d26, d2, [x3, #8] -; CHECK-NEXT: add v0.8b, v0.8b, v1.8b -; CHECK-NEXT: ldr d3, [x3] -; CHECK-NEXT: ldp d17, d29, [x1, #16] -; CHECK-NEXT: ushll v6.8h, v0.8b, #0 -; CHECK-NEXT: add v25.8b, v25.8b, v28.8b -; CHECK-NEXT: ldp d0, d24, [x1] -; CHECK-NEXT: add v2.8b, v3.8b, v2.8b -; CHECK-NEXT: ldr d27, [x3, #24] -; CHECK-NEXT: ldp d3, d30, [x0] -; CHECK-NEXT: add v0.8b, v0.8b, v17.8b -; CHECK-NEXT: ldp d4, d31, [x0, #16] -; CHECK-NEXT: add v26.8b, v26.8b, v27.8b -; CHECK-NEXT: add v24.8b, v24.8b, v29.8b -; CHECK-NEXT: ushll v5.8h, v2.8b, #0 -; CHECK-NEXT: ushll v19.8h, v0.8b, #0 -; CHECK-NEXT: ushll v26.8h, v26.8b, #0 -; CHECK-NEXT: ushll2 v7.4s, v5.8h, #0 -; CHECK-NEXT: add v3.8b, v3.8b, v4.8b -; CHECK-NEXT: ushll2 v21.4s, v19.8h, #0 -; CHECK-NEXT: add v30.8b, v30.8b, v31.8b -; CHECK-NEXT: ushll v8.4s, v6.4h, #0 -; CHECK-NEXT: ushll v25.8h, v25.8b, #0 -; CHECK-NEXT: ushll v28.4s, v26.4h, #0 -; CHECK-NEXT: ushll2 v26.4s, v26.8h, #0 -; CHECK-NEXT: ushll2 v16.4s, v6.8h, #0 -; CHECK-NEXT: ushll2 v1.2d, v7.4s, #0 -; CHECK-NEXT: ushll v20.8h, v3.8b, #0 -; CHECK-NEXT: ushll2 v4.2d, v21.4s, #0 -; CHECK-NEXT: ushll v23.4s, v5.4h, #0 -; CHECK-NEXT: ushll v17.2d, v7.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v8.4s, #0 -; CHECK-NEXT: ushll v9.4s, v19.4h, #0 -; CHECK-NEXT: ushll v19.2d, v21.2s, #0 -; CHECK-NEXT: ushll v21.2d, v8.2s, #0 -; CHECK-NEXT: ushll v29.8h, v30.8b, #0 -; CHECK-NEXT: ushll v30.8h, v24.8b, 
#0 -; CHECK-NEXT: ushll v27.4s, v25.4h, #0 -; CHECK-NEXT: ushll2 v25.4s, v25.8h, #0 -; CHECK-NEXT: ushll2 v8.2d, v26.4s, #3 -; CHECK-NEXT: ushll v26.2d, v26.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v16.4s, #0 -; CHECK-NEXT: ushll2 v22.4s, v20.8h, #0 -; CHECK-NEXT: ushll v18.2d, v16.2s, #0 -; CHECK-NEXT: ushll2 v6.2d, v9.4s, #0 -; CHECK-NEXT: ushll v16.4s, v20.4h, #0 -; CHECK-NEXT: ushll v20.2d, v9.2s, #0 -; CHECK-NEXT: ushll v24.4s, v29.4h, #0 -; CHECK-NEXT: ushll v31.4s, v30.4h, #0 -; CHECK-NEXT: ushll2 v29.4s, v29.8h, #0 -; CHECK-NEXT: ushll2 v30.4s, v30.8h, #0 -; CHECK-NEXT: ushll2 v9.2d, v25.4s, #3 -; CHECK-NEXT: ushll v25.2d, v25.2s, #3 -; CHECK-NEXT: sub v1.2d, v1.2d, v8.2d -; CHECK-NEXT: sub v17.2d, v17.2d, v26.2d -; CHECK-NEXT: ushll2 v0.2d, v22.4s, #0 -; CHECK-NEXT: ushll v22.2d, v22.2s, #0 -; CHECK-NEXT: stp q17, q1, [x8, #224] -; CHECK-NEXT: ushll2 v10.2d, v30.4s, #3 -; CHECK-NEXT: ushll2 v11.2d, v29.4s, #3 -; CHECK-NEXT: ushll v30.2d, v30.2s, #3 -; CHECK-NEXT: ushll v29.2d, v29.2s, #3 -; CHECK-NEXT: sub v2.2d, v2.2d, v9.2d -; CHECK-NEXT: sub v1.2d, v18.2d, v25.2d -; CHECK-NEXT: ushll2 v3.2d, v23.4s, #0 -; CHECK-NEXT: ushll v23.2d, v23.2s, #0 -; CHECK-NEXT: stp q1, q2, [x8, #160] -; CHECK-NEXT: ushll2 v12.2d, v28.4s, #3 -; CHECK-NEXT: ushll v28.2d, v28.2s, #3 -; CHECK-NEXT: sub v4.2d, v4.2d, v10.2d -; CHECK-NEXT: sub v2.2d, v19.2d, v30.2d -; CHECK-NEXT: sub v0.2d, v0.2d, v11.2d -; CHECK-NEXT: sub v1.2d, v22.2d, v29.2d -; CHECK-NEXT: stp q2, q4, [x8, #96] -; CHECK-NEXT: ushll2 v13.2d, v27.4s, #3 -; CHECK-NEXT: ushll v27.2d, v27.2s, #3 -; CHECK-NEXT: stp q1, q0, [x8, #32] -; CHECK-NEXT: sub v2.2d, v3.2d, v12.2d -; CHECK-NEXT: sub v0.2d, v23.2d, v28.2d -; CHECK-NEXT: ushll2 v14.2d, v31.4s, #3 -; CHECK-NEXT: ushll v31.2d, v31.2s, #3 -; CHECK-NEXT: stp q0, q2, [x8, #192] -; CHECK-NEXT: sub v1.2d, v5.2d, v13.2d -; CHECK-NEXT: sub v2.2d, v21.2d, v27.2d -; CHECK-NEXT: sub v0.2d, v6.2d, v14.2d -; CHECK-NEXT: stp q2, q1, [x8, #128] -; CHECK-NEXT: sub v1.2d, v20.2d, 
v31.2d -; CHECK-NEXT: ushll2 v7.2d, v16.4s, #0 -; CHECK-NEXT: ushll v2.2d, v16.2s, #0 -; CHECK-NEXT: stp q1, q0, [x8, #64] -; CHECK-NEXT: ushll2 v0.2d, v24.4s, #3 -; CHECK-NEXT: ushll v1.2d, v24.2s, #3 -; CHECK-NEXT: sub v0.2d, v7.2d, v0.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d +; CHECK-NEXT: ldp q0, q3, [x3] +; CHECK-NEXT: add v7.16b, v0.16b, v3.16b +; CHECK-NEXT: ldp q6, q0, [x1] +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: add v3.16b, v6.16b, v0.16b +; CHECK-NEXT: ushll2 v6.8h, v7.16b, #0 +; CHECK-NEXT: ushll v7.8h, v7.8b, #0 +; CHECK-NEXT: ushll v16.4s, v6.4h, #0 +; CHECK-NEXT: ushll v17.4s, v7.4h, #0 +; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: add v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ushll2 v2.8h, v3.16b, #0 +; CHECK-NEXT: ushll2 v0.8h, v1.16b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: add v5.16b, v4.16b, v5.16b +; CHECK-NEXT: ushll v18.2d, v16.2s, #3 +; CHECK-NEXT: ushll v19.2d, v17.2s, #0 +; CHECK-NEXT: ushll2 v4.8h, v5.16b, #0 +; CHECK-NEXT: ushll v20.8h, v5.8b, #0 +; CHECK-NEXT: sub v5.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll2 v18.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v19.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v23.4s, v2.8h, #0 +; CHECK-NEXT: ushll2 v24.4s, v3.8h, #0 +; CHECK-NEXT: ushll v21.2d, v18.2s, #3 +; CHECK-NEXT: ushll v22.2d, v19.2s, #0 +; CHECK-NEXT: ushll v25.2d, v23.2s, #3 +; CHECK-NEXT: ushll v26.2d, v24.2s, #0 +; CHECK-NEXT: sub v21.2d, v22.2d, v21.2d +; CHECK-NEXT: sub v22.2d, v26.2d, v25.2d +; CHECK-NEXT: ushll2 v25.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v26.4s, v20.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v6.8h, #0 +; CHECK-NEXT: ushll2 v7.4s, v7.8h, #0 +; CHECK-NEXT: ushll v27.2d, v25.2s, #3 +; CHECK-NEXT: ushll v28.2d, v26.2s, #0 +; CHECK-NEXT: ushll v29.2d, v6.2s, #3 +; CHECK-NEXT: ushll v30.2d, v7.2s, #0 +; CHECK-NEXT: ushll2 v25.2d, v25.4s, #3 +; CHECK-NEXT: ushll2 v26.2d, v26.4s, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ushll2 v6.2d, v6.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, 
v7.4s, #0 +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: sub v25.2d, v26.2d, v25.2d +; CHECK-NEXT: ushll v26.2d, v2.2s, #3 +; CHECK-NEXT: sub v6.2d, v7.2d, v6.2d +; CHECK-NEXT: ushll v7.2d, v3.2s, #0 +; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v3.4s, #0 +; CHECK-NEXT: sub v27.2d, v28.2d, v27.2d +; CHECK-NEXT: sub v28.2d, v30.2d, v29.2d +; CHECK-NEXT: ushll2 v23.2d, v23.4s, #3 +; CHECK-NEXT: stp q27, q25, [x8, #160] +; CHECK-NEXT: ushll2 v24.2d, v24.4s, #0 +; CHECK-NEXT: stp q28, q6, [x8, #224] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: ushll v20.4s, v20.4h, #0 +; CHECK-NEXT: ushll2 v18.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v19.4s, #0 +; CHECK-NEXT: sub v2.2d, v3.2d, v2.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v26.2d +; CHECK-NEXT: sub v23.2d, v24.2d, v23.2d +; CHECK-NEXT: ushll v24.2d, v0.2s, #3 +; CHECK-NEXT: stp q3, q2, [x8, #64] +; CHECK-NEXT: ushll v6.2d, v4.2s, #3 +; CHECK-NEXT: stp q22, q23, [x8, #96] +; CHECK-NEXT: ushll v28.2d, v20.2s, #0 +; CHECK-NEXT: ushll2 v4.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v16.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v17.2d, v17.4s, #0 +; CHECK-NEXT: sub v18.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll2 v19.2d, v20.4s, #0 +; CHECK-NEXT: ushll2 v0.2d, v0.4s, #3 +; CHECK-NEXT: stp q21, q18, [x8, #32] +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #0 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: sub v16.2d, v17.2d, v16.2d +; CHECK-NEXT: sub v4.2d, v19.2d, v4.2d +; CHECK-NEXT: sub v6.2d, v28.2d, v6.2d +; CHECK-NEXT: stp q5, q16, [x8, #192] +; CHECK-NEXT: sub v0.2d, v2.2d, v0.2d +; CHECK-NEXT: sub v1.2d, v1.2d, v24.2d +; CHECK-NEXT: stp q6, q4, [x8, #128] ; CHECK-NEXT: stp q1, q0, [x8] -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d14, [sp], #64 // 8-byte Folded Reload ; 
CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -7345,36 +6789,26 @@ define <16 x i32> @dbl_bv4_v4i16_v16i32(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dbl_bv4_v4i16_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d16, d6, [x3, #8] -; CHECK-NEXT: ldr d7, [x3] -; CHECK-NEXT: ldp d0, d4, [x2] -; CHECK-NEXT: add v6.4h, v7.4h, v6.4h -; CHECK-NEXT: ldr d17, [x3, #24] -; CHECK-NEXT: ldp d5, d7, [x2, #16] -; CHECK-NEXT: ushll v6.4s, v6.4h, #0 -; CHECK-NEXT: ldp d18, d3, [x1, #8] -; CHECK-NEXT: add v0.4h, v0.4h, v5.4h -; CHECK-NEXT: ldr d5, [x1] -; CHECK-NEXT: ldp d2, d20, [x0, #16] -; CHECK-NEXT: ushll v19.4s, v0.4h, #0 -; CHECK-NEXT: add v4.4h, v4.4h, v7.4h -; CHECK-NEXT: add v3.4h, v5.4h, v3.4h -; CHECK-NEXT: ldr d0, [x1, #24] -; CHECK-NEXT: ldp d1, d5, [x0] -; CHECK-NEXT: ushll v4.4s, v4.4h, #3 -; CHECK-NEXT: add v0.4h, v18.4h, v0.4h -; CHECK-NEXT: add v1.4h, v1.4h, v2.4h -; CHECK-NEXT: ushll v2.4s, v3.4h, #0 -; CHECK-NEXT: add v3.4h, v5.4h, v20.4h -; CHECK-NEXT: add v5.4h, v16.4h, v17.4h +; CHECK-NEXT: ldp q0, q3, [x3] +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: add v3.8h, v0.8h, v3.8h +; CHECK-NEXT: ldp q6, q0, [x1] +; CHECK-NEXT: add v1.8h, v1.8h, v2.8h +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #3 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v5.4s, v5.4h, #3 -; CHECK-NEXT: ushll v3.4s, v3.4h, #3 -; CHECK-NEXT: ushll v7.4s, v0.4h, #3 -; CHECK-NEXT: sub v0.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v2.4s, v7.4s -; CHECK-NEXT: sub v2.4s, v19.4s, v4.4s -; CHECK-NEXT: sub v3.4s, v6.4s, v5.4s +; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: add v0.8h, v6.8h, v0.8h +; CHECK-NEXT: ushll v6.4s, v0.4h, #0 +; CHECK-NEXT: add v4.8h, v4.8h, v5.8h +; CHECK-NEXT: ushll2 v5.4s, v0.8h, #3 +; CHECK-NEXT: sub v0.4s, v1.4s, v2.4s +; CHECK-NEXT: sub v1.4s, v6.4s, v5.4s +; CHECK-NEXT: ushll2 v2.4s, v4.8h, #3 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: ushll2 v5.4s, v3.8h, #3 +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: 
sub v2.4s, v4.4s, v2.4s +; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s ; CHECK-NEXT: ret %lp1 = load <4 x i16>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -7438,46 +6872,46 @@ define <32 x i32> @dbl_bv4_v8i16_v32i32(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dbl_bv4_v8i16_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: ldp q3, q4, [x0, #32] -; CHECK-NEXT: add v1.8h, v1.8h, v3.8h -; CHECK-NEXT: ldp q5, q6, [x1] -; CHECK-NEXT: add v2.8h, v2.8h, v4.8h -; CHECK-NEXT: ushll v24.4s, v2.4h, #3 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: ldp q7, q16, [x1, #32] -; CHECK-NEXT: add v5.8h, v5.8h, v7.8h -; CHECK-NEXT: ldp q0, q17, [x2] -; CHECK-NEXT: add v6.8h, v6.8h, v16.8h -; CHECK-NEXT: ushll v23.4s, v6.4h, #3 -; CHECK-NEXT: ushll2 v6.4s, v6.8h, #3 -; CHECK-NEXT: ldp q18, q19, [x2, #32] -; CHECK-NEXT: add v0.8h, v0.8h, v18.8h -; CHECK-NEXT: ushll v18.4s, v5.4h, #0 -; CHECK-NEXT: ldp q20, q21, [x3] -; CHECK-NEXT: add v17.8h, v17.8h, v19.8h -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll v19.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ldp q3, q7, [x3, #32] -; CHECK-NEXT: ushll v22.4s, v17.4h, #3 -; CHECK-NEXT: sub v1.4s, v1.4s, v2.4s -; CHECK-NEXT: sub v2.4s, v18.4s, v23.4s -; CHECK-NEXT: add v3.8h, v20.8h, v3.8h -; CHECK-NEXT: sub v4.4s, v4.4s, v22.4s -; CHECK-NEXT: add v7.8h, v21.8h, v7.8h -; CHECK-NEXT: ushll v16.4s, v3.4h, #0 -; CHECK-NEXT: ushll2 v20.4s, v3.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v5.8h, #0 -; CHECK-NEXT: ushll v21.4s, v7.4h, #3 -; CHECK-NEXT: ushll2 v7.4s, v7.8h, #3 -; CHECK-NEXT: ushll2 v5.4s, v17.8h, #3 -; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s -; CHECK-NEXT: sub v5.4s, v0.4s, v5.4s -; CHECK-NEXT: sub v7.4s, v20.4s, v7.4s -; CHECK-NEXT: sub v0.4s, v19.4s, v24.4s -; CHECK-NEXT: sub v6.4s, v16.4s, v21.4s +; CHECK-NEXT: ldp q5, q4, [x3] +; CHECK-NEXT: ldp q7, q6, [x3, #32] +; CHECK-NEXT: add v19.8h, v5.8h, v7.8h +; CHECK-NEXT: ldp q3, q0, [x2] +; 
CHECK-NEXT: add v4.8h, v4.8h, v6.8h +; CHECK-NEXT: ushll2 v21.4s, v19.8h, #0 +; CHECK-NEXT: ushll2 v20.4s, v4.8h, #3 +; CHECK-NEXT: ldp q17, q16, [x2, #32] +; CHECK-NEXT: add v17.8h, v3.8h, v17.8h +; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: add v16.8h, v0.8h, v16.8h +; CHECK-NEXT: ldp q5, q18, [x1, #32] +; CHECK-NEXT: add v2.8h, v2.8h, v5.8h +; CHECK-NEXT: ldp q3, q6, [x0] +; CHECK-NEXT: add v18.8h, v1.8h, v18.8h +; CHECK-NEXT: ushll2 v7.4s, v2.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ldp q5, q0, [x0, #32] +; CHECK-NEXT: add v0.8h, v6.8h, v0.8h +; CHECK-NEXT: add v6.8h, v3.8h, v5.8h +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v6.8h, #0 +; CHECK-NEXT: ushll2 v5.4s, v18.8h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: ushll v6.4s, v6.4h, #0 +; CHECK-NEXT: ushll v18.4s, v18.4h, #3 +; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v5.4s +; CHECK-NEXT: ushll2 v5.4s, v16.8h, #3 +; CHECK-NEXT: ushll2 v7.4s, v17.8h, #0 +; CHECK-NEXT: sub v0.4s, v6.4s, v0.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v18.4s +; CHECK-NEXT: ushll v6.4s, v16.4h, #3 +; CHECK-NEXT: ushll v16.4s, v17.4h, #0 +; CHECK-NEXT: ushll v17.4s, v4.4h, #3 +; CHECK-NEXT: ushll v18.4s, v19.4h, #0 +; CHECK-NEXT: sub v5.4s, v7.4s, v5.4s +; CHECK-NEXT: sub v7.4s, v21.4s, v20.4s +; CHECK-NEXT: sub v4.4s, v16.4s, v6.4s +; CHECK-NEXT: sub v6.4s, v18.4s, v17.4s ; CHECK-NEXT: ret %lp1 = load <8 x i16>, ptr %p %p2 = getelementptr i8, ptr %p, i32 16 @@ -7541,90 +6975,30 @@ define <8 x i64> @dbl_bv4_v2i16_v8i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dbl_bv4_v2i16_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #4 -; CHECK-NEXT: add x9, x0, #8 -; CHECK-NEXT: ld1 { v20.h }[0], [x0] -; CHECK-NEXT: add x10, x0, #12 -; CHECK-NEXT: add x11, x3, #10 -; CHECK-NEXT: ld1 { v16.h }[0], [x8] -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: ld1 { v4.h }[0], [x9] -; CHECK-NEXT: add x9, x0, #6 -; CHECK-NEXT: ld1 { v0.h }[0], [x10] -; CHECK-NEXT: add x10, 
x0, #14 -; CHECK-NEXT: ld1 { v20.h }[2], [x8] -; CHECK-NEXT: add x8, x0, #10 -; CHECK-NEXT: ld1 { v16.h }[2], [x9] -; CHECK-NEXT: add x9, x1, #4 -; CHECK-NEXT: ld1 { v21.h }[0], [x1] -; CHECK-NEXT: ld1 { v4.h }[2], [x8] -; CHECK-NEXT: add x8, x1, #2 -; CHECK-NEXT: ld1 { v0.h }[2], [x10] -; CHECK-NEXT: add x10, x1, #8 -; CHECK-NEXT: ld1 { v17.h }[0], [x9] -; CHECK-NEXT: add x9, x1, #12 -; CHECK-NEXT: ld1 { v21.h }[2], [x8] -; CHECK-NEXT: add x8, x1, #6 -; CHECK-NEXT: ld1 { v5.h }[0], [x10] -; CHECK-NEXT: add x10, x2, #4 -; CHECK-NEXT: ld1 { v1.h }[0], [x9] -; CHECK-NEXT: add x9, x1, #10 -; CHECK-NEXT: ld1 { v17.h }[2], [x8] -; CHECK-NEXT: add x8, x1, #14 -; CHECK-NEXT: ld1 { v22.h }[0], [x2] -; CHECK-NEXT: ld1 { v5.h }[2], [x9] -; CHECK-NEXT: add x9, x2, #2 -; CHECK-NEXT: ld1 { v1.h }[2], [x8] -; CHECK-NEXT: add x8, x2, #8 -; CHECK-NEXT: ld1 { v18.h }[0], [x10] -; CHECK-NEXT: add x10, x2, #12 -; CHECK-NEXT: ld1 { v22.h }[2], [x9] -; CHECK-NEXT: add x9, x2, #6 -; CHECK-NEXT: ld1 { v6.h }[0], [x8] -; CHECK-NEXT: add x8, x2, #10 -; CHECK-NEXT: ld1 { v2.h }[0], [x10] -; CHECK-NEXT: add x10, x3, #4 -; CHECK-NEXT: ld1 { v18.h }[2], [x9] -; CHECK-NEXT: add x9, x2, #14 -; CHECK-NEXT: ld1 { v23.h }[0], [x3] -; CHECK-NEXT: ld1 { v6.h }[2], [x8] -; CHECK-NEXT: add x8, x3, #2 -; CHECK-NEXT: ld1 { v2.h }[2], [x9] -; CHECK-NEXT: add x9, x3, #8 -; CHECK-NEXT: ld1 { v19.h }[0], [x10] -; CHECK-NEXT: add x10, x3, #12 -; CHECK-NEXT: ld1 { v23.h }[2], [x8] -; CHECK-NEXT: add x8, x3, #6 -; CHECK-NEXT: ld1 { v7.h }[0], [x9] -; CHECK-NEXT: adrp x9, .LCPI142_0 -; CHECK-NEXT: ld1 { v3.h }[0], [x10] -; CHECK-NEXT: ld1 { v19.h }[2], [x8] -; CHECK-NEXT: add x8, x3, #14 -; CHECK-NEXT: ldr q24, [x9, :lo12:.LCPI142_0] -; CHECK-NEXT: ld1 { v7.h }[2], [x11] -; CHECK-NEXT: ld1 { v3.h }[2], [x8] -; CHECK-NEXT: tbl v20.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v24.16b -; CHECK-NEXT: tbl v16.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v24.16b -; CHECK-NEXT: tbl v4.16b, { v4.16b, v5.16b, v6.16b, 
v7.16b }, v24.16b -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v24.16b -; CHECK-NEXT: add v1.8h, v20.8h, v4.8h -; CHECK-NEXT: add v0.8h, v16.8h, v0.8h -; CHECK-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v3.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-NEXT: ushll v5.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v6.2d, v3.2s, #3 -; CHECK-NEXT: ushll v7.2d, v0.2s, #3 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #3 -; CHECK-NEXT: ushll2 v16.2d, v3.4s, #3 -; CHECK-NEXT: sub v3.2d, v1.2d, v0.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v16.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: ldp d1, d2, [x0] +; CHECK-NEXT: ldp d3, d4, [x1] +; CHECK-NEXT: ldp d0, d5, [x2] +; CHECK-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-NEXT: ldp d6, d2, [x3] +; CHECK-NEXT: add v3.4h, v3.4h, v4.4h +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: add v0.4h, v0.4h, v5.4h +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: add v2.4h, v6.4h, v2.4h +; CHECK-NEXT: ushll v4.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v0.2d, v1.4s, #3 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: ushll2 v5.2d, v3.4s, #3 +; CHECK-NEXT: ushll v3.2d, v3.2s, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ushll2 v6.2d, v4.4s, #3 +; CHECK-NEXT: sub v0.2d, v1.2d, v0.2d +; CHECK-NEXT: sub v1.2d, v3.2d, v5.2d +; CHECK-NEXT: ushll v3.2d, v4.2s, #0 +; CHECK-NEXT: ushll2 v4.2d, v2.4s, #3 +; CHECK-NEXT: ushll v5.2d, v2.2s, #0 +; CHECK-NEXT: sub v2.2d, v3.2d, v6.2d +; CHECK-NEXT: sub v3.2d, v5.2d, v4.2d ; CHECK-NEXT: ret %lp1 = load <2 x i16>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -7688,56 +7062,46 @@ define <16 x i64> @dbl_bv4_v4i16_v16i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dbl_bv4_v4i16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d6, d4, [x3, #8] -; CHECK-NEXT: ldr d5, [x3] -; 
CHECK-NEXT: ldp d19, d16, [x1, #8] -; CHECK-NEXT: ldp d0, d2, [x2] -; CHECK-NEXT: add v4.4h, v5.4h, v4.4h -; CHECK-NEXT: ldp d3, d5, [x2, #16] -; CHECK-NEXT: ushll v4.4s, v4.4h, #0 -; CHECK-NEXT: ldr d17, [x1] -; CHECK-NEXT: ldr d7, [x3, #24] -; CHECK-NEXT: add v0.4h, v0.4h, v3.4h -; CHECK-NEXT: ldr d21, [x1, #24] -; CHECK-NEXT: ldp d3, d22, [x0, #16] -; CHECK-NEXT: add v16.4h, v17.4h, v16.4h -; CHECK-NEXT: add v6.4h, v6.4h, v7.4h -; CHECK-NEXT: ldp d1, d17, [x0] -; CHECK-NEXT: add v19.4h, v19.4h, v21.4h -; CHECK-NEXT: add v2.4h, v2.4h, v5.4h -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v19.4s, v19.4h, #0 -; CHECK-NEXT: add v1.4h, v1.4h, v3.4h -; CHECK-NEXT: add v17.4h, v17.4h, v22.4h -; CHECK-NEXT: ushll v3.4s, v16.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v17.4s, v17.4h, #0 -; CHECK-NEXT: ushll v5.4s, v6.4h, #0 +; CHECK-NEXT: ldp q0, q3, [x3] +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: add v0.8h, v0.8h, v3.8h +; CHECK-NEXT: ushll2 v19.4s, v0.8h, #0 +; CHECK-NEXT: ushll v20.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v0.2d, v19.4s, #3 +; CHECK-NEXT: ldp q6, q3, [x1] +; CHECK-NEXT: add v1.8h, v1.8h, v2.8h +; CHECK-NEXT: ushll2 v21.2d, v20.4s, #0 +; CHECK-NEXT: ushll v16.4s, v1.4h, #0 +; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: add v2.8h, v6.8h, v3.8h +; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v17.4s, v2.8h, #0 ; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v18.2d, v4.2s, #0 -; CHECK-NEXT: ushll v20.2d, v0.2s, #0 -; CHECK-NEXT: ushll v16.2d, v3.2s, #0 -; CHECK-NEXT: ushll v23.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v6.2d, v5.2s, #3 -; CHECK-NEXT: ushll v21.2d, v2.2s, #3 -; CHECK-NEXT: ushll v22.2d, v19.2s, #3 -; CHECK-NEXT: ushll v24.2d, v17.2s, #3 -; CHECK-NEXT: ushll2 v7.2d, v5.4s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v1.2d, 
v6.4s, #3 +; CHECK-NEXT: add v4.8h, v4.8h, v5.8h +; CHECK-NEXT: ushll2 v3.2d, v16.4s, #0 ; CHECK-NEXT: ushll2 v5.2d, v17.4s, #3 -; CHECK-NEXT: ushll2 v17.2d, v19.4s, #3 -; CHECK-NEXT: sub v1.2d, v1.2d, v5.2d -; CHECK-NEXT: sub v3.2d, v3.2d, v17.2d -; CHECK-NEXT: sub v5.2d, v0.2d, v2.2d -; CHECK-NEXT: sub v7.2d, v4.2d, v7.2d -; CHECK-NEXT: sub v0.2d, v23.2d, v24.2d -; CHECK-NEXT: sub v2.2d, v16.2d, v22.2d -; CHECK-NEXT: sub v4.2d, v20.2d, v21.2d -; CHECK-NEXT: sub v6.2d, v18.2d, v6.2d +; CHECK-NEXT: ushll2 v7.2d, v2.4s, #0 +; CHECK-NEXT: ushll2 v18.4s, v4.8h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll2 v5.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v4.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: sub v5.2d, v7.2d, v5.2d +; CHECK-NEXT: sub v7.2d, v21.2d, v0.2d +; CHECK-NEXT: ushll v0.2d, v6.2s, #3 +; CHECK-NEXT: ushll v6.2d, v16.2s, #0 +; CHECK-NEXT: ushll v16.2d, v17.2s, #3 +; CHECK-NEXT: sub v0.2d, v6.2d, v0.2d +; CHECK-NEXT: sub v2.2d, v2.2d, v16.2d +; CHECK-NEXT: ushll v6.2d, v18.2s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: ushll v16.2d, v19.2s, #3 +; CHECK-NEXT: ushll v17.2d, v20.2s, #0 +; CHECK-NEXT: sub v4.2d, v4.2d, v6.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d ; CHECK-NEXT: ret %lp1 = load <4 x i16>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -7801,110 +7165,94 @@ define <32 x i64> @dbl_bv4_v8i16_v32i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dbl_bv4_v8i16_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str d14, [sp, #-64]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_offset b8, -8 -; CHECK-NEXT: .cfi_offset b9, -16 -; CHECK-NEXT: .cfi_offset b10, -24 -; CHECK-NEXT: .cfi_offset b11, -32 -; CHECK-NEXT: .cfi_offset b12, -40 -; CHECK-NEXT: .cfi_offset b13, -48 -; CHECK-NEXT: .cfi_offset b14, -64 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: ldp q3, q4, [x0, #32] -; CHECK-NEXT: add v22.8h, v1.8h, v3.8h -; CHECK-NEXT: ldp q20, q21, [x3] -; CHECK-NEXT: add v28.8h, v2.8h, v4.8h -; CHECK-NEXT: ldp q1, q3, [x3, #32] -; CHECK-NEXT: ldp q0, q17, [x2] -; CHECK-NEXT: add v25.8h, v21.8h, v3.8h -; CHECK-NEXT: ushll v31.4s, v25.4h, #0 -; CHECK-NEXT: ushll2 v25.4s, v25.8h, #0 -; CHECK-NEXT: ushll2 v12.2d, v31.4s, #3 -; CHECK-NEXT: ldp q5, q6, [x1] -; CHECK-NEXT: ushll2 v8.2d, v25.4s, #3 -; CHECK-NEXT: ushll v25.2d, v25.2s, #3 -; CHECK-NEXT: ushll v31.2d, v31.2s, #3 -; CHECK-NEXT: ldp q7, q16, [x1, #32] -; CHECK-NEXT: add v7.8h, v5.8h, v7.8h -; CHECK-NEXT: ldp q18, q19, [x2, #32] -; CHECK-NEXT: add v27.8h, v6.8h, v16.8h -; CHECK-NEXT: ushll2 v16.4s, v7.8h, #0 -; CHECK-NEXT: ushll v29.4s, v7.4h, #0 -; CHECK-NEXT: add v5.8h, v0.8h, v18.8h -; CHECK-NEXT: add v18.8h, v20.8h, v1.8h -; CHECK-NEXT: add v26.8h, v17.8h, v19.8h -; CHECK-NEXT: ushll2 v2.4s, v18.8h, #0 -; CHECK-NEXT: ushll2 v6.4s, v5.8h, #0 -; CHECK-NEXT: ushll2 v3.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v20.4s, v22.8h, #0 -; CHECK-NEXT: ushll v17.2d, v2.2s, #0 -; CHECK-NEXT: ushll v30.4s, v26.4h, #0 -; CHECK-NEXT: ushll2 v26.4s, v26.8h, #0 -; CHECK-NEXT: ushll2 v4.2d, v6.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v16.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v20.4s, #0 -; CHECK-NEXT: ushll v21.4s, v18.4h, #0 -; CHECK-NEXT: ushll v24.4s, v5.4h, #0 -; CHECK-NEXT: ushll v18.2d, v6.2s, #0 -; CHECK-NEXT: ushll v19.2d, v16.2s, #0 -; CHECK-NEXT: 
ushll2 v6.2d, v29.4s, #0 -; CHECK-NEXT: ushll v16.4s, v22.4h, #0 +; CHECK-NEXT: ldp q5, q6, [x3] +; CHECK-NEXT: ldp q17, q16, [x3, #32] +; CHECK-NEXT: add v5.8h, v5.8h, v17.8h +; CHECK-NEXT: ldp q0, q1, [x2] +; CHECK-NEXT: add v6.8h, v6.8h, v16.8h +; CHECK-NEXT: ldp q19, q18, [x2, #32] +; CHECK-NEXT: add v0.8h, v0.8h, v19.8h +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: add v1.8h, v1.8h, v18.8h +; CHECK-NEXT: ldp q16, q17, [x1, #32] +; CHECK-NEXT: add v2.8h, v2.8h, v16.8h +; CHECK-NEXT: ushll v16.4s, v6.4h, #0 +; CHECK-NEXT: ldp q4, q7, [x0] +; CHECK-NEXT: add v3.8h, v3.8h, v17.8h +; CHECK-NEXT: ushll v17.4s, v5.4h, #0 +; CHECK-NEXT: ushll2 v23.4s, v3.8h, #0 +; CHECK-NEXT: ushll v20.2d, v17.2s, #0 +; CHECK-NEXT: ushll2 v24.4s, v2.8h, #0 +; CHECK-NEXT: ldp q18, q19, [x0, #32] +; CHECK-NEXT: ushll v25.2d, v23.2s, #3 +; CHECK-NEXT: ushll v26.2d, v24.2s, #0 +; CHECK-NEXT: ushll2 v6.4s, v6.8h, #0 +; CHECK-NEXT: add v4.8h, v4.8h, v18.8h +; CHECK-NEXT: ushll v18.2d, v16.2s, #3 +; CHECK-NEXT: add v19.8h, v7.8h, v19.8h +; CHECK-NEXT: sub v7.2d, v20.2d, v18.2d +; CHECK-NEXT: ushll2 v18.4s, v19.8h, #0 +; CHECK-NEXT: ushll2 v20.4s, v4.8h, #0 +; CHECK-NEXT: ushll v21.2d, v18.2s, #3 ; CHECK-NEXT: ushll v22.2d, v20.2s, #0 -; CHECK-NEXT: ushll v20.2d, v29.2s, #0 -; CHECK-NEXT: ushll v29.4s, v27.4h, #0 -; CHECK-NEXT: ushll2 v27.4s, v27.8h, #0 -; CHECK-NEXT: ushll2 v9.2d, v26.4s, #3 -; CHECK-NEXT: ushll v26.2d, v26.2s, #3 -; CHECK-NEXT: sub v3.2d, v3.2d, v8.2d -; CHECK-NEXT: sub v17.2d, v17.2d, v25.2d -; CHECK-NEXT: ushll2 v2.2d, v21.4s, #0 -; CHECK-NEXT: ushll2 v5.2d, v24.4s, #0 -; CHECK-NEXT: stp q17, q3, [x8, #224] -; CHECK-NEXT: ushll v23.2d, v21.2s, #0 -; CHECK-NEXT: ushll v21.2d, v24.2s, #0 -; CHECK-NEXT: ushll v24.4s, v28.4h, #0 -; CHECK-NEXT: ushll2 v28.4s, v28.8h, #0 -; CHECK-NEXT: ushll2 v10.2d, v27.4s, #3 -; CHECK-NEXT: ushll v27.2d, v27.2s, #3 -; CHECK-NEXT: sub v4.2d, v4.2d, v9.2d -; CHECK-NEXT: sub v3.2d, v18.2d, v26.2d -; CHECK-NEXT: ushll2 v11.2d, v28.4s, #3 -; 
CHECK-NEXT: ushll v28.2d, v28.2s, #3 -; CHECK-NEXT: stp q3, q4, [x8, #160] -; CHECK-NEXT: sub v1.2d, v1.2d, v10.2d -; CHECK-NEXT: sub v4.2d, v19.2d, v27.2d -; CHECK-NEXT: sub v0.2d, v0.2d, v11.2d -; CHECK-NEXT: stp q4, q1, [x8, #96] -; CHECK-NEXT: sub v1.2d, v22.2d, v28.2d -; CHECK-NEXT: ushll2 v13.2d, v30.4s, #3 -; CHECK-NEXT: ushll v30.2d, v30.2s, #3 -; CHECK-NEXT: stp q1, q0, [x8, #32] -; CHECK-NEXT: sub v2.2d, v2.2d, v12.2d -; CHECK-NEXT: sub v0.2d, v23.2d, v31.2d -; CHECK-NEXT: ushll2 v14.2d, v29.4s, #3 -; CHECK-NEXT: ushll v29.2d, v29.2s, #3 -; CHECK-NEXT: stp q0, q2, [x8, #192] -; CHECK-NEXT: sub v1.2d, v5.2d, v13.2d -; CHECK-NEXT: sub v2.2d, v21.2d, v30.2d -; CHECK-NEXT: sub v0.2d, v6.2d, v14.2d -; CHECK-NEXT: stp q2, q1, [x8, #128] -; CHECK-NEXT: sub v1.2d, v20.2d, v29.2d -; CHECK-NEXT: ushll2 v7.2d, v16.4s, #0 -; CHECK-NEXT: ushll v2.2d, v16.2s, #0 -; CHECK-NEXT: stp q1, q0, [x8, #64] -; CHECK-NEXT: ushll2 v0.2d, v24.4s, #3 -; CHECK-NEXT: ushll v1.2d, v24.2s, #3 -; CHECK-NEXT: sub v0.2d, v7.2d, v0.2d +; CHECK-NEXT: ushll2 v5.4s, v5.8h, #0 +; CHECK-NEXT: sub v21.2d, v22.2d, v21.2d +; CHECK-NEXT: sub v22.2d, v26.2d, v25.2d +; CHECK-NEXT: ushll2 v25.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v26.4s, v0.8h, #0 +; CHECK-NEXT: ushll v27.2d, v25.2s, #3 +; CHECK-NEXT: ushll v28.2d, v26.2s, #0 +; CHECK-NEXT: ushll v29.2d, v6.2s, #3 +; CHECK-NEXT: ushll v30.2d, v5.2s, #0 +; CHECK-NEXT: ushll2 v6.2d, v6.4s, #3 +; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 +; CHECK-NEXT: sub v27.2d, v28.2d, v27.2d +; CHECK-NEXT: sub v28.2d, v30.2d, v29.2d +; CHECK-NEXT: sub v5.2d, v5.2d, v6.2d +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: stp q28, q5, [x8, #224] +; CHECK-NEXT: ushll2 v25.2d, v25.4s, #3 +; CHECK-NEXT: ushll2 v26.2d, v26.4s, #0 +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ushll v5.2d, v1.2s, #3 +; CHECK-NEXT: ushll v28.2d, v0.2s, #0 +; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 
v0.2d, v0.4s, #0 +; CHECK-NEXT: sub v25.2d, v26.2d, v25.2d +; CHECK-NEXT: ushll v26.2d, v3.2s, #3 +; CHECK-NEXT: ushll v6.2d, v2.2s, #0 +; CHECK-NEXT: stp q27, q25, [x8, #160] +; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-NEXT: sub v1.2d, v28.2d, v5.2d +; CHECK-NEXT: ushll2 v23.2d, v23.4s, #3 +; CHECK-NEXT: ushll2 v24.2d, v24.4s, #0 +; CHECK-NEXT: stp q1, q0, [x8, #128] +; CHECK-NEXT: ushll v19.4s, v19.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: sub v2.2d, v2.2d, v3.2d +; CHECK-NEXT: sub v0.2d, v6.2d, v26.2d +; CHECK-NEXT: sub v23.2d, v24.2d, v23.2d +; CHECK-NEXT: ushll v24.2d, v19.2s, #3 +; CHECK-NEXT: stp q0, q2, [x8, #64] +; CHECK-NEXT: ushll2 v16.2d, v16.4s, #3 +; CHECK-NEXT: stp q22, q23, [x8, #96] +; CHECK-NEXT: ushll2 v18.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v20.2d, v20.4s, #0 +; CHECK-NEXT: ushll2 v17.2d, v17.4s, #0 +; CHECK-NEXT: ushll2 v1.2d, v19.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v4.4s, #0 +; CHECK-NEXT: ushll v0.2d, v4.2s, #0 +; CHECK-NEXT: sub v18.2d, v20.2d, v18.2d +; CHECK-NEXT: sub v16.2d, v17.2d, v16.2d ; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d -; CHECK-NEXT: stp q1, q0, [x8] -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d14, [sp], #64 // 8-byte Folded Reload +; CHECK-NEXT: stp q21, q18, [x8, #32] +; CHECK-NEXT: sub v0.2d, v0.2d, v24.2d +; CHECK-NEXT: stp q7, q16, [x8, #192] +; CHECK-NEXT: stp q0, q1, [x8] ; CHECK-NEXT: ret %lp1 = load <8 x i16>, ptr %p %p2 = getelementptr i8, ptr %p, i32 16 @@ -7968,36 +7316,26 @@ define <8 x i64> @dbl_bv4_v2i32_v8i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dbl_bv4_v2i32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d16, d6, [x3, #8] -; CHECK-NEXT: ldr d7, [x3] -; CHECK-NEXT: ldp d0, d4, [x2] -; CHECK-NEXT: add v6.2s, v7.2s, v6.2s -; CHECK-NEXT: ldr d17, 
[x3, #24] -; CHECK-NEXT: ldp d5, d7, [x2, #16] -; CHECK-NEXT: ushll v6.2d, v6.2s, #0 -; CHECK-NEXT: ldp d18, d3, [x1, #8] -; CHECK-NEXT: add v0.2s, v0.2s, v5.2s -; CHECK-NEXT: ldr d5, [x1] -; CHECK-NEXT: ldp d2, d20, [x0, #16] -; CHECK-NEXT: ushll v19.2d, v0.2s, #0 -; CHECK-NEXT: add v4.2s, v4.2s, v7.2s -; CHECK-NEXT: add v3.2s, v5.2s, v3.2s -; CHECK-NEXT: ldr d0, [x1, #24] -; CHECK-NEXT: ldp d1, d5, [x0] -; CHECK-NEXT: ushll v4.2d, v4.2s, #3 -; CHECK-NEXT: add v0.2s, v18.2s, v0.2s -; CHECK-NEXT: add v1.2s, v1.2s, v2.2s -; CHECK-NEXT: ushll v2.2d, v3.2s, #0 -; CHECK-NEXT: add v3.2s, v5.2s, v20.2s -; CHECK-NEXT: add v5.2s, v16.2s, v17.2s +; CHECK-NEXT: ldp q0, q3, [x3] +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-NEXT: ldp q6, q0, [x1] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ushll2 v2.2d, v1.4s, #3 ; CHECK-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-NEXT: ushll v5.2d, v5.2s, #3 -; CHECK-NEXT: ushll v3.2d, v3.2s, #3 -; CHECK-NEXT: ushll v7.2d, v0.2s, #3 -; CHECK-NEXT: sub v0.2d, v1.2d, v3.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v7.2d -; CHECK-NEXT: sub v2.2d, v19.2d, v4.2d -; CHECK-NEXT: sub v3.2d, v6.2d, v5.2d +; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: add v0.4s, v6.4s, v0.4s +; CHECK-NEXT: ushll v6.2d, v0.2s, #0 +; CHECK-NEXT: add v4.4s, v4.4s, v5.4s +; CHECK-NEXT: ushll2 v5.2d, v0.4s, #3 +; CHECK-NEXT: sub v0.2d, v1.2d, v2.2d +; CHECK-NEXT: sub v1.2d, v6.2d, v5.2d +; CHECK-NEXT: ushll2 v2.2d, v4.4s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: ushll2 v5.2d, v3.4s, #3 +; CHECK-NEXT: ushll v3.2d, v3.2s, #0 +; CHECK-NEXT: sub v2.2d, v4.2d, v2.2d +; CHECK-NEXT: sub v3.2d, v3.2d, v5.2d ; CHECK-NEXT: ret %lp1 = load <2 x i32>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -8061,46 +7399,46 @@ define <16 x i64> @dbl_bv4_v4i32_v16i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dbl_bv4_v4i32_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: ldp q3, q4, [x0, #32] -; CHECK-NEXT: add v1.4s, 
v1.4s, v3.4s -; CHECK-NEXT: ldp q5, q6, [x1] -; CHECK-NEXT: add v2.4s, v2.4s, v4.4s -; CHECK-NEXT: ushll v24.2d, v2.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ldp q7, q16, [x1, #32] -; CHECK-NEXT: add v5.4s, v5.4s, v7.4s -; CHECK-NEXT: ldp q0, q17, [x2] -; CHECK-NEXT: add v6.4s, v6.4s, v16.4s -; CHECK-NEXT: ushll v23.2d, v6.2s, #3 -; CHECK-NEXT: ushll2 v6.2d, v6.4s, #3 -; CHECK-NEXT: ldp q18, q19, [x2, #32] -; CHECK-NEXT: add v0.4s, v0.4s, v18.4s -; CHECK-NEXT: ushll v18.2d, v5.2s, #0 -; CHECK-NEXT: ldp q20, q21, [x3] -; CHECK-NEXT: add v17.4s, v17.4s, v19.4s -; CHECK-NEXT: ushll v4.2d, v0.2s, #0 -; CHECK-NEXT: ushll v19.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ldp q3, q7, [x3, #32] -; CHECK-NEXT: ushll v22.2d, v17.2s, #3 -; CHECK-NEXT: sub v1.2d, v1.2d, v2.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v23.2d -; CHECK-NEXT: add v3.4s, v20.4s, v3.4s -; CHECK-NEXT: sub v4.2d, v4.2d, v22.2d -; CHECK-NEXT: add v7.4s, v21.4s, v7.4s -; CHECK-NEXT: ushll v16.2d, v3.2s, #0 -; CHECK-NEXT: ushll2 v20.2d, v3.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, v5.4s, #0 -; CHECK-NEXT: ushll v21.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v7.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v5.2d, v17.4s, #3 -; CHECK-NEXT: sub v3.2d, v3.2d, v6.2d -; CHECK-NEXT: sub v5.2d, v0.2d, v5.2d -; CHECK-NEXT: sub v7.2d, v20.2d, v7.2d -; CHECK-NEXT: sub v0.2d, v19.2d, v24.2d -; CHECK-NEXT: sub v6.2d, v16.2d, v21.2d +; CHECK-NEXT: ldp q5, q4, [x3] +; CHECK-NEXT: ldp q7, q6, [x3, #32] +; CHECK-NEXT: add v19.4s, v5.4s, v7.4s +; CHECK-NEXT: ldp q3, q0, [x2] +; CHECK-NEXT: add v4.4s, v4.4s, v6.4s +; CHECK-NEXT: ushll2 v21.2d, v19.4s, #0 +; CHECK-NEXT: ushll2 v20.2d, v4.4s, #3 +; CHECK-NEXT: ldp q17, q16, [x2, #32] +; CHECK-NEXT: add v17.4s, v3.4s, v17.4s +; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: add v16.4s, v0.4s, v16.4s +; CHECK-NEXT: ldp q5, q18, [x1, #32] +; CHECK-NEXT: add v2.4s, v2.4s, v5.4s +; CHECK-NEXT: ldp q3, q6, [x0] +; CHECK-NEXT: add 
v18.4s, v1.4s, v18.4s +; CHECK-NEXT: ushll2 v7.2d, v2.4s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: ldp q5, q0, [x0, #32] +; CHECK-NEXT: add v0.4s, v6.4s, v0.4s +; CHECK-NEXT: add v6.4s, v3.4s, v5.4s +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v5.2d, v18.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: ushll v18.2d, v18.2s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll2 v5.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v17.4s, #0 +; CHECK-NEXT: sub v0.2d, v6.2d, v0.2d +; CHECK-NEXT: sub v2.2d, v2.2d, v18.2d +; CHECK-NEXT: ushll v6.2d, v16.2s, #3 +; CHECK-NEXT: ushll v16.2d, v17.2s, #0 +; CHECK-NEXT: ushll v17.2d, v4.2s, #3 +; CHECK-NEXT: ushll v18.2d, v19.2s, #0 +; CHECK-NEXT: sub v5.2d, v7.2d, v5.2d +; CHECK-NEXT: sub v7.2d, v21.2d, v20.2d +; CHECK-NEXT: sub v4.2d, v16.2d, v6.2d +; CHECK-NEXT: sub v6.2d, v18.2d, v17.2d ; CHECK-NEXT: ret %lp1 = load <4 x i32>, ptr %p %p2 = getelementptr i8, ptr %p, i32 16 @@ -8164,110 +7502,103 @@ define <32 x i64> @dbl_bv4_v8i32_v32i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dbl_bv4_v8i32_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str d14, [sp, #-64]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_offset b8, -8 ; CHECK-NEXT: .cfi_offset b9, -16 ; CHECK-NEXT: .cfi_offset b10, -24 ; CHECK-NEXT: .cfi_offset b11, -32 -; CHECK-NEXT: .cfi_offset b12, -40 -; CHECK-NEXT: .cfi_offset b13, -48 -; CHECK-NEXT: .cfi_offset b14, -64 -; CHECK-NEXT: ldp q0, q2, [x0] -; CHECK-NEXT: ldp q6, q5, [x0, #64] -; CHECK-NEXT: add v0.4s, v0.4s, v6.4s -; CHECK-NEXT: ldp q16, q7, [x1] -; CHECK-NEXT: add v28.4s, v2.4s, v5.4s -; CHECK-NEXT: ldp q20, q19, [x1, #64] -; CHECK-NEXT: add v16.4s, v16.4s, v20.4s -; CHECK-NEXT: ldp q21, q1, [x2] -; CHECK-NEXT: add v19.4s, v7.4s, v19.4s -; CHECK-NEXT: ldp q25, q24, [x2, #64] -; CHECK-NEXT: add v20.4s, v21.4s, v25.4s -; CHECK-NEXT: ldp q27, q26, [x3] -; CHECK-NEXT: add v24.4s, v1.4s, v24.4s -; CHECK-NEXT: ldp q7, q5, [x3, #64] -; CHECK-NEXT: add v25.4s, v27.4s, v7.4s -; CHECK-NEXT: ldp q22, q23, [x2, #32] -; CHECK-NEXT: add v26.4s, v26.4s, v5.4s -; CHECK-NEXT: ushll2 v31.2d, v25.4s, #0 -; CHECK-NEXT: ushll2 v30.2d, v26.4s, #0 -; CHECK-NEXT: ushll v26.2d, v26.2s, #0 -; CHECK-NEXT: ushll v25.2d, v25.2s, #0 -; CHECK-NEXT: ldp q6, q2, [x3, #32] -; CHECK-NEXT: ldp q21, q1, [x3, #96] -; CHECK-NEXT: add v21.4s, v6.4s, v21.4s -; CHECK-NEXT: ldp q7, q5, [x2, #96] -; CHECK-NEXT: add v27.4s, v2.4s, v1.4s -; CHECK-NEXT: ushll2 v9.2d, v21.4s, #3 -; CHECK-NEXT: ushll2 v8.2d, v27.4s, #3 -; CHECK-NEXT: add v22.4s, v22.4s, v7.4s -; CHECK-NEXT: ushll2 v7.2d, v24.4s, #0 -; CHECK-NEXT: ldp q3, q4, [x0, #32] -; CHECK-NEXT: add v23.4s, v23.4s, v5.4s -; CHECK-NEXT: ushll v24.2d, v24.2s, #0 -; CHECK-NEXT: ushll2 v10.2d, v23.4s, #3 -; CHECK-NEXT: ushll v23.2d, v23.2s, #3 -; CHECK-NEXT: ushll2 v11.2d, v22.4s, #3 -; CHECK-NEXT: ldp q17, q18, [x1, #32] -; CHECK-NEXT: ushll v22.2d, v22.2s, #3 -; CHECK-NEXT: sub v7.2d, v7.2d, v10.2d -; CHECK-NEXT: sub v23.2d, v24.2d, v23.2d -; CHECK-NEXT: ushll v27.2d, v27.2s, #3 -; CHECK-NEXT: ushll 
v21.2d, v21.2s, #3 -; CHECK-NEXT: ldp q1, q2, [x1, #96] -; CHECK-NEXT: sub v30.2d, v30.2d, v8.2d -; CHECK-NEXT: sub v26.2d, v26.2d, v27.2d -; CHECK-NEXT: sub v27.2d, v31.2d, v9.2d -; CHECK-NEXT: add v17.4s, v17.4s, v1.4s -; CHECK-NEXT: sub v21.2d, v25.2d, v21.2d -; CHECK-NEXT: ldp q5, q6, [x0, #96] -; CHECK-NEXT: add v18.4s, v18.4s, v2.4s -; CHECK-NEXT: stp q23, q7, [x8, #160] -; CHECK-NEXT: ushll2 v13.2d, v17.4s, #3 -; CHECK-NEXT: stp q21, q27, [x8, #192] -; CHECK-NEXT: ushll2 v12.2d, v18.4s, #3 -; CHECK-NEXT: stp q26, q30, [x8, #224] -; CHECK-NEXT: add v1.4s, v3.4s, v5.4s -; CHECK-NEXT: ushll2 v5.2d, v19.4s, #0 -; CHECK-NEXT: add v29.4s, v4.4s, v6.4s -; CHECK-NEXT: ushll2 v6.2d, v20.4s, #0 -; CHECK-NEXT: ushll v20.2d, v20.2s, #0 -; CHECK-NEXT: ushll v19.2d, v19.2s, #0 -; CHECK-NEXT: ushll v18.2d, v18.2s, #3 -; CHECK-NEXT: sub v6.2d, v6.2d, v11.2d -; CHECK-NEXT: sub v7.2d, v20.2d, v22.2d -; CHECK-NEXT: ushll2 v4.2d, v16.4s, #0 -; CHECK-NEXT: ushll v16.2d, v16.2s, #0 -; CHECK-NEXT: stp q7, q6, [x8, #128] -; CHECK-NEXT: ushll v17.2d, v17.2s, #3 -; CHECK-NEXT: sub v5.2d, v5.2d, v12.2d -; CHECK-NEXT: sub v6.2d, v19.2d, v18.2d -; CHECK-NEXT: ushll2 v3.2d, v28.4s, #0 -; CHECK-NEXT: ushll v28.2d, v28.2s, #0 -; CHECK-NEXT: stp q6, q5, [x8, #96] -; CHECK-NEXT: ushll2 v14.2d, v29.4s, #3 -; CHECK-NEXT: ushll v29.2d, v29.2s, #3 -; CHECK-NEXT: sub v4.2d, v4.2d, v13.2d -; CHECK-NEXT: sub v5.2d, v16.2d, v17.2d -; CHECK-NEXT: sub v3.2d, v3.2d, v14.2d -; CHECK-NEXT: stp q5, q4, [x8, #64] -; CHECK-NEXT: sub v4.2d, v28.2d, v29.2d -; CHECK-NEXT: ushll2 v2.2d, v0.4s, #0 +; CHECK-NEXT: ldp q21, q7, [x3, #32] +; CHECK-NEXT: ldp q23, q6, [x3] +; CHECK-NEXT: ldp q25, q17, [x3, #64] +; CHECK-NEXT: ldp q26, q27, [x3, #96] +; CHECK-NEXT: add v6.4s, v6.4s, v17.4s +; CHECK-NEXT: ushll v11.2d, v6.2s, #0 +; CHECK-NEXT: add v21.4s, v21.4s, v26.4s +; CHECK-NEXT: ushll2 v6.2d, v6.4s, #0 +; CHECK-NEXT: ldp q0, q19, [x2, #32] +; CHECK-NEXT: add v17.4s, v7.4s, v27.4s +; CHECK-NEXT: add v7.4s, v23.4s, 
v25.4s +; CHECK-NEXT: ushll v10.2d, v17.2s, #3 +; CHECK-NEXT: ushll2 v17.2d, v17.4s, #3 +; CHECK-NEXT: ldp q24, q18, [x2] +; CHECK-NEXT: sub v6.2d, v6.2d, v17.2d +; CHECK-NEXT: ldp q28, q29, [x2, #64] +; CHECK-NEXT: ldp q27, q23, [x2, #96] +; CHECK-NEXT: add v18.4s, v18.4s, v29.4s +; CHECK-NEXT: ushll v9.2d, v18.2s, #0 +; CHECK-NEXT: add v27.4s, v0.4s, v27.4s +; CHECK-NEXT: ushll2 v17.2d, v18.4s, #0 +; CHECK-NEXT: ldp q20, q5, [x1, #32] +; CHECK-NEXT: add v23.4s, v19.4s, v23.4s +; CHECK-NEXT: add v19.4s, v24.4s, v28.4s +; CHECK-NEXT: ushll v8.2d, v23.2s, #3 +; CHECK-NEXT: ushll v31.2d, v19.2s, #0 +; CHECK-NEXT: ushll2 v23.2d, v23.4s, #3 +; CHECK-NEXT: ldp q22, q4, [x1] +; CHECK-NEXT: sub v17.2d, v17.2d, v23.2d +; CHECK-NEXT: ldp q25, q26, [x1, #64] +; CHECK-NEXT: ldp q29, q24, [x1, #96] +; CHECK-NEXT: add v4.4s, v4.4s, v26.4s +; CHECK-NEXT: ldp q1, q2, [x0, #32] +; CHECK-NEXT: add v24.4s, v5.4s, v24.4s +; CHECK-NEXT: add v5.4s, v22.4s, v25.4s +; CHECK-NEXT: add v25.4s, v20.4s, v29.4s +; CHECK-NEXT: ushll v29.2d, v4.2s, #0 +; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 +; CHECK-NEXT: ldp q16, q3, [x0] +; CHECK-NEXT: ushll v30.2d, v5.2s, #0 +; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 +; CHECK-NEXT: ldp q0, q26, [x0, #64] +; CHECK-NEXT: add v0.4s, v16.4s, v0.4s +; CHECK-NEXT: ldp q28, q22, [x0, #96] +; CHECK-NEXT: add v3.4s, v3.4s, v26.4s +; CHECK-NEXT: ushll v20.2d, v3.2s, #0 +; CHECK-NEXT: ushll2 v3.2d, v3.4s, #0 +; CHECK-NEXT: add v26.4s, v2.4s, v22.4s +; CHECK-NEXT: add v2.4s, v1.4s, v28.4s +; CHECK-NEXT: ushll v28.2d, v24.2s, #3 +; CHECK-NEXT: ushll2 v24.2d, v24.4s, #3 +; CHECK-NEXT: ushll v22.2d, v25.2s, #3 +; CHECK-NEXT: sub v28.2d, v29.2d, v28.2d +; CHECK-NEXT: ushll v29.2d, v27.2s, #3 +; CHECK-NEXT: ushll2 v25.2d, v25.4s, #3 +; CHECK-NEXT: sub v4.2d, v4.2d, v24.2d +; CHECK-NEXT: ushll v16.2d, v26.2s, #3 +; CHECK-NEXT: sub v29.2d, v31.2d, v29.2d +; CHECK-NEXT: stp q28, q4, [x8, #96] +; CHECK-NEXT: sub v31.2d, v9.2d, v8.2d +; CHECK-NEXT: ushll v8.2d, v21.2s, #3 +; 
CHECK-NEXT: ushll v9.2d, v7.2s, #0 +; CHECK-NEXT: stp q31, q17, [x8, #160] +; CHECK-NEXT: ushll2 v26.2d, v26.4s, #3 +; CHECK-NEXT: sub v5.2d, v5.2d, v25.2d +; CHECK-NEXT: sub v4.2d, v30.2d, v22.2d +; CHECK-NEXT: sub v8.2d, v9.2d, v8.2d +; CHECK-NEXT: sub v9.2d, v11.2d, v10.2d +; CHECK-NEXT: stp q4, q5, [x8, #64] +; CHECK-NEXT: sub v3.2d, v3.2d, v26.2d +; CHECK-NEXT: sub v5.2d, v20.2d, v16.2d +; CHECK-NEXT: stp q9, q6, [x8, #224] +; CHECK-NEXT: ushll v1.2d, v2.2s, #3 +; CHECK-NEXT: ushll2 v27.2d, v27.4s, #3 +; CHECK-NEXT: stp q5, q3, [x8, #32] +; CHECK-NEXT: ushll2 v21.2d, v21.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v7.4s, #0 +; CHECK-NEXT: ushll2 v6.2d, v19.4s, #0 +; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: stp q4, q3, [x8, #32] -; CHECK-NEXT: ushll2 v3.2d, v1.4s, #3 -; CHECK-NEXT: ushll v1.2d, v1.2s, #3 -; CHECK-NEXT: sub v2.2d, v2.2d, v3.2d +; CHECK-NEXT: sub v7.2d, v7.2d, v21.2d +; CHECK-NEXT: sub v6.2d, v6.2d, v27.2d +; CHECK-NEXT: sub v2.2d, v3.2d, v2.2d +; CHECK-NEXT: stp q8, q7, [x8, #192] ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-NEXT: stp q29, q6, [x8, #128] ; CHECK-NEXT: stp q0, q2, [x8] -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d14, [sp], #64 // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret %lp1 = load <8 x i32>, ptr %p %p2 = getelementptr i8, ptr %p, i32 32 @@ -8472,38 +7803,26 @@ define <16 x i32> @dblext_bv4_v4i8_v16i16_v16i32(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dblext_bv4_v4i8_v16i16_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x2] -; CHECK-NEXT: ldp s2, s3, [x0] -; CHECK-NEXT: add x8, x1, #8 -; CHECK-NEXT: add x9, x1, #12 -; CHECK-NEXT: ldp s4, s5, [x2, #8] -; 
CHECK-NEXT: add x10, x3, #8 -; CHECK-NEXT: add x11, x3, #12 -; CHECK-NEXT: ldp s6, s7, [x0, #8] -; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 -; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 -; CHECK-NEXT: ld1 { v4.s }[1], [x10] -; CHECK-NEXT: ld1 { v6.s }[1], [x8] -; CHECK-NEXT: ld1 { v1.s }[1], [x3] -; CHECK-NEXT: ld1 { v3.s }[1], [x1] -; CHECK-NEXT: ld1 { v5.s }[1], [x11] -; CHECK-NEXT: ld1 { v7.s }[1], [x9] -; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b -; CHECK-NEXT: uaddl v0.8h, v0.8b, v4.8b -; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b -; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll v5.4s, v2.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: ushll v6.4s, v1.4h, #3 -; CHECK-NEXT: ushll v7.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3 -; CHECK-NEXT: ushll2 v16.4s, v1.8h, #3 -; CHECK-NEXT: sub v1.4s, v2.4s, v3.4s -; CHECK-NEXT: sub v3.4s, v0.4s, v16.4s -; CHECK-NEXT: sub v0.4s, v5.4s, v7.4s -; CHECK-NEXT: sub v2.4s, v4.4s, v6.4s +; CHECK-NEXT: ldp d0, d3, [x3] +; CHECK-NEXT: ldp d1, d2, [x0] +; CHECK-NEXT: ldp d4, d5, [x2] +; CHECK-NEXT: uaddl v3.8h, v0.8b, v3.8b +; CHECK-NEXT: ldp d6, d0, [x1] +; CHECK-NEXT: uaddl v1.8h, v1.8b, v2.8b +; CHECK-NEXT: uaddl v2.8h, v4.8b, v5.8b +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #3 +; CHECK-NEXT: uaddl v0.8h, v6.8b, v0.8b +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v7.4s, v2.8h, #3 +; CHECK-NEXT: ushll2 v5.4s, v0.8h, #3 +; CHECK-NEXT: ushll v6.4s, v0.4h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: sub v0.4s, v1.4s, v4.4s +; CHECK-NEXT: ushll2 v4.4s, v3.8h, #3 +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: sub v1.4s, v6.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s +; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -8571,46 +7890,42 @@ define <32 x i32> @dblext_bv4_v8i8_v32i16_v32i32(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: 
dblext_bv4_v8i8_v32i16_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ldp d3, d4, [x0, #16] -; CHECK-NEXT: ldp d5, d6, [x1] -; CHECK-NEXT: ldp d7, d16, [x1, #16] -; CHECK-NEXT: uaddl v1.8h, v1.8b, v3.8b -; CHECK-NEXT: ldp d0, d17, [x2] -; CHECK-NEXT: uaddl v2.8h, v2.8b, v4.8b -; CHECK-NEXT: ldp d18, d19, [x2, #16] -; CHECK-NEXT: uaddl v5.8h, v5.8b, v7.8b -; CHECK-NEXT: ldp d20, d21, [x3] -; CHECK-NEXT: uaddl v6.8h, v6.8b, v16.8b -; CHECK-NEXT: ldp d3, d7, [x3, #16] -; CHECK-NEXT: uaddl v0.8h, v0.8b, v18.8b -; CHECK-NEXT: uaddl v17.8h, v17.8b, v19.8b -; CHECK-NEXT: ushll v18.4s, v5.4h, #0 -; CHECK-NEXT: uaddl v3.8h, v20.8b, v3.8b -; CHECK-NEXT: uaddl v7.8h, v21.8b, v7.8b -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll v16.4s, v3.4h, #0 -; CHECK-NEXT: ushll v19.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v20.4s, v3.8h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v3.4s, v5.8h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v21.4s, v7.4h, #3 -; CHECK-NEXT: ushll v22.4s, v17.4h, #3 -; CHECK-NEXT: ushll v23.4s, v6.4h, #3 -; CHECK-NEXT: ushll v24.4s, v2.4h, #3 -; CHECK-NEXT: ushll2 v7.4s, v7.8h, #3 -; CHECK-NEXT: ushll2 v5.4s, v17.8h, #3 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: ushll2 v6.4s, v6.8h, #3 -; CHECK-NEXT: sub v1.4s, v1.4s, v2.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s -; CHECK-NEXT: sub v5.4s, v0.4s, v5.4s -; CHECK-NEXT: sub v7.4s, v20.4s, v7.4s -; CHECK-NEXT: sub v0.4s, v19.4s, v24.4s -; CHECK-NEXT: sub v2.4s, v18.4s, v23.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v22.4s -; CHECK-NEXT: sub v6.4s, v16.4s, v21.4s +; CHECK-NEXT: ldp q0, q3, [x3] +; CHECK-NEXT: ldp q1, q2, [x1] +; CHECK-NEXT: uaddl v6.8h, v0.8b, v3.8b +; CHECK-NEXT: uaddl2 v16.8h, v0.16b, v3.16b +; CHECK-NEXT: ushll2 v21.4s, v6.8h, #0 +; CHECK-NEXT: ushll2 v20.4s, v16.8h, #3 +; CHECK-NEXT: ldp q0, q3, [x0] +; CHECK-NEXT: uaddl v18.8h, v1.8b, v2.8b +; CHECK-NEXT: uaddl2 v2.8h, v1.16b, v2.16b +; CHECK-NEXT: ushll v16.4s, v16.4h, 
#3 +; CHECK-NEXT: ushll2 v7.4s, v18.8h, #0 +; CHECK-NEXT: ushll v18.4s, v18.4h, #0 +; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: uaddl2 v19.8h, v0.16b, v3.16b +; CHECK-NEXT: uaddl v0.8h, v0.8b, v3.8b +; CHECK-NEXT: ushll v6.4s, v6.4h, #0 +; CHECK-NEXT: ushll2 v1.4s, v19.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: uaddl v17.8h, v4.8b, v5.8b +; CHECK-NEXT: uaddl2 v4.8h, v4.16b, v5.16b +; CHECK-NEXT: ushll2 v5.4s, v2.8h, #3 +; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v5.4s +; CHECK-NEXT: ushll2 v5.4s, v4.8h, #3 +; CHECK-NEXT: ushll2 v7.4s, v17.8h, #0 +; CHECK-NEXT: ushll v19.4s, v19.4h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #3 +; CHECK-NEXT: ushll v4.4s, v4.4h, #3 +; CHECK-NEXT: ushll v17.4s, v17.4h, #0 +; CHECK-NEXT: sub v5.4s, v7.4s, v5.4s +; CHECK-NEXT: sub v7.4s, v21.4s, v20.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v19.4s +; CHECK-NEXT: sub v2.4s, v18.4s, v2.4s +; CHECK-NEXT: sub v4.4s, v17.4s, v4.4s +; CHECK-NEXT: sub v6.4s, v6.4s, v16.4s ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -8678,90 +7993,30 @@ define <8 x i64> @dblext_bv4_v2i8_v8i16_v8i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dblext_bv4_v2i8_v8i16_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: add x9, x0, #4 -; CHECK-NEXT: add x10, x0, #6 -; CHECK-NEXT: ld1 { v20.b }[0], [x0] -; CHECK-NEXT: add x11, x3, #3 -; CHECK-NEXT: ld1 { v4.b }[0], [x8] -; CHECK-NEXT: add x8, x0, #1 -; CHECK-NEXT: ld1 { v16.b }[0], [x9] -; CHECK-NEXT: add x9, x0, #3 -; CHECK-NEXT: ld1 { v0.b }[0], [x10] -; CHECK-NEXT: add x10, x0, #5 -; CHECK-NEXT: ld1 { v20.b }[4], [x8] -; CHECK-NEXT: add x8, x0, #7 -; CHECK-NEXT: ld1 { v4.b }[4], [x9] -; CHECK-NEXT: add x9, x1, #2 -; CHECK-NEXT: ld1 { v16.b }[4], [x10] -; CHECK-NEXT: add x10, x1, #4 -; CHECK-NEXT: ld1 { v21.b }[0], [x1] -; CHECK-NEXT: ld1 { v5.b }[0], [x9] -; CHECK-NEXT: add x9, x1, #6 -; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; 
CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: ld1 { v17.b }[0], [x10] -; CHECK-NEXT: add x10, x1, #3 -; CHECK-NEXT: ld1 { v1.b }[0], [x9] -; CHECK-NEXT: add x9, x1, #7 -; CHECK-NEXT: ld1 { v21.b }[4], [x8] -; CHECK-NEXT: add x8, x1, #5 -; CHECK-NEXT: ld1 { v5.b }[4], [x10] -; CHECK-NEXT: add x10, x2, #4 -; CHECK-NEXT: ld1 { v22.b }[0], [x2] -; CHECK-NEXT: ld1 { v17.b }[4], [x8] -; CHECK-NEXT: add x8, x2, #2 -; CHECK-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-NEXT: add x9, x2, #1 -; CHECK-NEXT: ld1 { v18.b }[0], [x10] -; CHECK-NEXT: add x10, x2, #3 -; CHECK-NEXT: ld1 { v6.b }[0], [x8] -; CHECK-NEXT: add x8, x2, #6 -; CHECK-NEXT: ld1 { v22.b }[4], [x9] -; CHECK-NEXT: add x9, x2, #5 -; CHECK-NEXT: ld1 { v23.b }[0], [x3] -; CHECK-NEXT: ld1 { v2.b }[0], [x8] -; CHECK-NEXT: add x8, x2, #7 -; CHECK-NEXT: ld1 { v18.b }[4], [x9] -; CHECK-NEXT: add x9, x3, #2 -; CHECK-NEXT: ld1 { v6.b }[4], [x10] -; CHECK-NEXT: add x10, x3, #4 -; CHECK-NEXT: ld1 { v7.b }[0], [x9] -; CHECK-NEXT: add x9, x3, #6 -; CHECK-NEXT: ld1 { v2.b }[4], [x8] -; CHECK-NEXT: add x8, x3, #1 -; CHECK-NEXT: ld1 { v19.b }[0], [x10] -; CHECK-NEXT: add x10, x3, #7 -; CHECK-NEXT: ld1 { v3.b }[0], [x9] -; CHECK-NEXT: adrp x9, .LCPI151_0 -; CHECK-NEXT: ld1 { v23.b }[4], [x8] -; CHECK-NEXT: add x8, x3, #5 -; CHECK-NEXT: ld1 { v7.b }[4], [x11] -; CHECK-NEXT: ldr d24, [x9, :lo12:.LCPI151_0] -; CHECK-NEXT: ld1 { v19.b }[4], [x8] -; CHECK-NEXT: ld1 { v3.b }[4], [x10] -; CHECK-NEXT: tbl v20.8b, { v20.16b, v21.16b, v22.16b, v23.16b }, v24.8b -; CHECK-NEXT: tbl v4.8b, { v4.16b, v5.16b, v6.16b, v7.16b }, v24.8b -; CHECK-NEXT: tbl v16.8b, { v16.16b, v17.16b, v18.16b, v19.16b }, v24.8b -; CHECK-NEXT: tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v24.8b -; CHECK-NEXT: uaddl v1.8h, v20.8b, v16.8b -; CHECK-NEXT: uaddl v0.8h, v4.8b, v0.8b -; CHECK-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v3.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; 
CHECK-NEXT: ushll v5.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v6.2d, v3.2s, #3 -; CHECK-NEXT: ushll v7.2d, v0.2s, #3 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #3 -; CHECK-NEXT: ushll2 v16.2d, v3.4s, #3 -; CHECK-NEXT: sub v3.2d, v1.2d, v0.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v16.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: ldp s0, s2, [x2] +; CHECK-NEXT: ldp s1, s3, [x0] +; CHECK-NEXT: ld1 { v1.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v3.s }[1], [x1] +; CHECK-NEXT: ld1 { v2.s }[1], [x3] +; CHECK-NEXT: uaddl v1.8h, v1.8b, v3.8b +; CHECK-NEXT: uaddl v0.8h, v0.8b, v2.8b +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: ushll v4.4s, v1.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v2.4s, #3 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: ushll2 v5.2d, v3.4s, #3 +; CHECK-NEXT: ushll v3.2d, v3.2s, #0 +; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d +; CHECK-NEXT: ushll2 v2.2d, v4.4s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: sub v3.2d, v3.2d, v5.2d +; CHECK-NEXT: ushll2 v5.2d, v0.4s, #3 +; CHECK-NEXT: ushll v6.2d, v0.2s, #0 +; CHECK-NEXT: sub v0.2d, v4.2d, v2.2d +; CHECK-NEXT: sub v2.2d, v6.2d, v5.2d ; CHECK-NEXT: ret %lp1 = load <2 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 2 @@ -8829,58 +8084,46 @@ define <16 x i64> @dblext_bv4_v4i8_v16i16_v16i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dblext_bv4_v4i8_v16i16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x2] -; CHECK-NEXT: ldp s2, s3, [x0] -; CHECK-NEXT: add x8, x1, #8 -; CHECK-NEXT: add x9, x1, #12 -; CHECK-NEXT: ldp s4, s5, [x2, #8] -; CHECK-NEXT: add x10, x3, #8 -; CHECK-NEXT: add x11, x3, #12 -; CHECK-NEXT: ldp s6, s7, [x0, #8] -; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 -; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 -; CHECK-NEXT: ld1 { v4.s }[1], [x10] -; CHECK-NEXT: 
ld1 { v6.s }[1], [x8] -; CHECK-NEXT: ld1 { v1.s }[1], [x3] -; CHECK-NEXT: ld1 { v5.s }[1], [x11] -; CHECK-NEXT: ld1 { v3.s }[1], [x1] -; CHECK-NEXT: ld1 { v7.s }[1], [x9] -; CHECK-NEXT: uaddl v0.8h, v0.8b, v4.8b -; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b -; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b -; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll v5.4s, v2.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v3.8h, #0 -; CHECK-NEXT: ushll2 v19.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v16.2d, v4.2s, #0 -; CHECK-NEXT: ushll v17.2d, v5.2s, #0 -; CHECK-NEXT: ushll v6.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll v18.2d, v2.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll v20.2d, v1.2s, #3 -; CHECK-NEXT: ushll v21.2d, v3.2s, #3 -; CHECK-NEXT: ushll v22.2d, v19.2s, #3 -; CHECK-NEXT: ushll2 v23.2d, v1.4s, #3 -; CHECK-NEXT: ushll v24.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v1.2d, v3.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v7.2d, v19.4s, #3 -; CHECK-NEXT: sub v3.2d, v2.2d, v3.2d -; CHECK-NEXT: sub v7.2d, v0.2d, v7.2d -; CHECK-NEXT: sub v1.2d, v5.2d, v1.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v24.2d -; CHECK-NEXT: sub v5.2d, v4.2d, v23.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v17.2d, v21.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v20.2d +; CHECK-NEXT: ldp d0, d3, [x3] +; CHECK-NEXT: ldp d1, d2, [x0] +; CHECK-NEXT: ldp d4, d5, [x2] +; CHECK-NEXT: uaddl v0.8h, v0.8b, v3.8b +; CHECK-NEXT: ldp d6, d3, [x1] +; CHECK-NEXT: uaddl v1.8h, v1.8b, v2.8b +; CHECK-NEXT: ushll2 v20.4s, v0.8h, #0 +; CHECK-NEXT: uaddl v2.8h, v4.8b, v5.8b +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-NEXT: uaddl v3.8h, v6.8b, v3.8b +; CHECK-NEXT: ushll v6.4s, v1.4h, #0 
+; CHECK-NEXT: ushll2 v1.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v16.4s, v3.8h, #0 +; CHECK-NEXT: ushll v17.4s, v3.4h, #0 +; CHECK-NEXT: ushll2 v3.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v5.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v17.4s, #0 +; CHECK-NEXT: ushll2 v18.4s, v2.8h, #0 +; CHECK-NEXT: ushll v19.4s, v2.4h, #0 +; CHECK-NEXT: ushll v21.4s, v0.4h, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll2 v2.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v5.2d, v19.4s, #0 +; CHECK-NEXT: ushll2 v0.2d, v20.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v21.4s, #0 +; CHECK-NEXT: sub v5.2d, v5.2d, v2.2d +; CHECK-NEXT: sub v7.2d, v7.2d, v0.2d +; CHECK-NEXT: ushll v0.2d, v4.2s, #3 +; CHECK-NEXT: ushll v2.2d, v6.2s, #0 +; CHECK-NEXT: ushll v4.2d, v16.2s, #3 +; CHECK-NEXT: ushll v6.2d, v17.2s, #0 +; CHECK-NEXT: sub v0.2d, v2.2d, v0.2d +; CHECK-NEXT: sub v2.2d, v6.2d, v4.2d +; CHECK-NEXT: ushll v4.2d, v18.2s, #3 +; CHECK-NEXT: ushll v6.2d, v19.2s, #0 +; CHECK-NEXT: ushll v16.2d, v20.2s, #3 +; CHECK-NEXT: ushll v17.2d, v21.2s, #0 +; CHECK-NEXT: sub v4.2d, v6.2d, v4.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -8945,113 +8188,93 @@ ret <16 x i64> %a } -define <32 x i64> @dblext_bv4_v8i8_v32i16_v32i64(ptr %p, ptr %q, ptr %r, ptr %s) { -; CHECK-LABEL: dblext_bv4_v8i8_v32i16_v32i64: -; CHECK: // %bb.0: -; CHECK-NEXT: str d14, [sp, #-64]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_offset b8, -8 -; CHECK-NEXT: .cfi_offset b9, -16 -; CHECK-NEXT: .cfi_offset b10, -24 -; CHECK-NEXT: .cfi_offset b11, -32 -; CHECK-NEXT: .cfi_offset b12, -40 -; CHECK-NEXT: .cfi_offset b13, -48 -; CHECK-NEXT: .cfi_offset b14, -64 -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ldp d3, d4, [x0, #16] -; CHECK-NEXT: ldp d20, d21, [x3] -; CHECK-NEXT: uaddl v22.8h, v1.8b, v3.8b -; CHECK-NEXT: ldp d1, d3, [x3, #16] -; CHECK-NEXT: uaddl v28.8h, v2.8b, v4.8b -; CHECK-NEXT: ldp d0, d17, [x2] -; CHECK-NEXT: ldp d5, d6, [x1] -; CHECK-NEXT: uaddl v25.8h, v21.8b, v3.8b -; CHECK-NEXT: ldp d7, d16, [x1, #16] -; CHECK-NEXT: ldp d18, d19, [x2, #16] -; CHECK-NEXT: ushll v31.4s, v25.4h, #0 -; CHECK-NEXT: uaddl v7.8h, v5.8b, v7.8b -; CHECK-NEXT: ushll2 v25.4s, v25.8h, #0 -; CHECK-NEXT: uaddl v5.8h, v0.8b, v18.8b -; CHECK-NEXT: uaddl v18.8h, v20.8b, v1.8b -; CHECK-NEXT: uaddl v26.8h, v17.8b, v19.8b -; CHECK-NEXT: uaddl v27.8h, v6.8b, v16.8b -; CHECK-NEXT: ushll2 v2.4s, v18.8h, #0 -; CHECK-NEXT: ushll2 v6.4s, v5.8h, #0 -; CHECK-NEXT: ushll2 v3.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v16.4s, v7.8h, #0 -; CHECK-NEXT: ushll2 v20.4s, v22.8h, #0 -; CHECK-NEXT: ushll v17.2d, v2.2s, #0 -; CHECK-NEXT: ushll v29.4s, v7.4h, #0 -; CHECK-NEXT: ushll v30.4s, v26.4h, #0 -; CHECK-NEXT: ushll2 v26.4s, v26.8h, #0 -; CHECK-NEXT: ushll2 v8.2d, v25.4s, #3 -; CHECK-NEXT: ushll v25.2d, v25.2s, #3 -; CHECK-NEXT: ushll2 v4.2d, v6.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v16.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v20.4s, #0 -; CHECK-NEXT: ushll v21.4s, v18.4h, #0 -; CHECK-NEXT: ushll v24.4s, v5.4h, #0 -; CHECK-NEXT: ushll v18.2d, v6.2s, #0 -; CHECK-NEXT: ushll v19.2d, v16.2s, #0 -; CHECK-NEXT: ushll2 v6.2d, v29.4s, #0 -; CHECK-NEXT: ushll v16.4s, v22.4h, #0 -; 
CHECK-NEXT: ushll v22.2d, v20.2s, #0 -; CHECK-NEXT: ushll v20.2d, v29.2s, #0 -; CHECK-NEXT: ushll v29.4s, v27.4h, #0 -; CHECK-NEXT: ushll2 v27.4s, v27.8h, #0 -; CHECK-NEXT: ushll2 v9.2d, v26.4s, #3 -; CHECK-NEXT: ushll v26.2d, v26.2s, #3 -; CHECK-NEXT: sub v3.2d, v3.2d, v8.2d -; CHECK-NEXT: sub v17.2d, v17.2d, v25.2d -; CHECK-NEXT: ushll2 v2.2d, v21.4s, #0 -; CHECK-NEXT: ushll2 v5.2d, v24.4s, #0 -; CHECK-NEXT: stp q17, q3, [x8, #224] -; CHECK-NEXT: ushll v23.2d, v21.2s, #0 -; CHECK-NEXT: ushll v21.2d, v24.2s, #0 -; CHECK-NEXT: ushll v24.4s, v28.4h, #0 -; CHECK-NEXT: ushll2 v28.4s, v28.8h, #0 -; CHECK-NEXT: ushll2 v10.2d, v27.4s, #3 -; CHECK-NEXT: ushll v27.2d, v27.2s, #3 -; CHECK-NEXT: sub v4.2d, v4.2d, v9.2d -; CHECK-NEXT: sub v3.2d, v18.2d, v26.2d -; CHECK-NEXT: ushll2 v11.2d, v28.4s, #3 -; CHECK-NEXT: ushll v28.2d, v28.2s, #3 -; CHECK-NEXT: stp q3, q4, [x8, #160] -; CHECK-NEXT: sub v1.2d, v1.2d, v10.2d -; CHECK-NEXT: sub v4.2d, v19.2d, v27.2d -; CHECK-NEXT: ushll2 v12.2d, v31.4s, #3 -; CHECK-NEXT: ushll v31.2d, v31.2s, #3 -; CHECK-NEXT: stp q4, q1, [x8, #96] -; CHECK-NEXT: sub v0.2d, v0.2d, v11.2d -; CHECK-NEXT: sub v1.2d, v22.2d, v28.2d -; CHECK-NEXT: ushll2 v13.2d, v30.4s, #3 -; CHECK-NEXT: ushll v30.2d, v30.2s, #3 -; CHECK-NEXT: stp q1, q0, [x8, #32] -; CHECK-NEXT: sub v2.2d, v2.2d, v12.2d -; CHECK-NEXT: sub v0.2d, v23.2d, v31.2d -; CHECK-NEXT: ushll2 v14.2d, v29.4s, #3 -; CHECK-NEXT: ushll v29.2d, v29.2s, #3 -; CHECK-NEXT: stp q0, q2, [x8, #192] -; CHECK-NEXT: sub v1.2d, v5.2d, v13.2d -; CHECK-NEXT: sub v2.2d, v21.2d, v30.2d -; CHECK-NEXT: sub v0.2d, v6.2d, v14.2d -; CHECK-NEXT: stp q2, q1, [x8, #128] -; CHECK-NEXT: sub v1.2d, v20.2d, v29.2d -; CHECK-NEXT: ushll2 v7.2d, v16.4s, #0 -; CHECK-NEXT: ushll v2.2d, v16.2s, #0 -; CHECK-NEXT: stp q1, q0, [x8, #64] -; CHECK-NEXT: ushll2 v0.2d, v24.4s, #3 -; CHECK-NEXT: ushll v1.2d, v24.2s, #3 -; CHECK-NEXT: sub v0.2d, v7.2d, v0.2d +define <32 x i64> @dblext_bv4_v8i8_v32i16_v32i64(ptr %p, ptr %q, ptr %r, ptr %s) { +; 
CHECK-LABEL: dblext_bv4_v8i8_v32i16_v32i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x3] +; CHECK-NEXT: ldp q5, q16, [x0] +; CHECK-NEXT: uaddl v6.8h, v0.8b, v1.8b +; CHECK-NEXT: uaddl2 v7.8h, v0.16b, v1.16b +; CHECK-NEXT: ldp q2, q3, [x2] +; CHECK-NEXT: ldp q4, q17, [x1] +; CHECK-NEXT: uaddl v0.8h, v2.8b, v3.8b +; CHECK-NEXT: uaddl2 v1.8h, v2.16b, v3.16b +; CHECK-NEXT: uaddl v2.8h, v4.8b, v17.8b +; CHECK-NEXT: uaddl2 v3.8h, v4.16b, v17.16b +; CHECK-NEXT: uaddl v4.8h, v5.8b, v16.8b +; CHECK-NEXT: uaddl2 v5.8h, v5.16b, v16.16b +; CHECK-NEXT: ushll v16.4s, v7.4h, #0 +; CHECK-NEXT: ushll v17.4s, v6.4h, #0 +; CHECK-NEXT: ushll v18.2d, v16.2s, #3 +; CHECK-NEXT: ushll v19.2d, v17.2s, #0 +; CHECK-NEXT: ushll2 v20.4s, v5.8h, #0 +; CHECK-NEXT: sub v18.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll2 v19.4s, v4.8h, #0 +; CHECK-NEXT: ushll2 v23.4s, v3.8h, #0 +; CHECK-NEXT: ushll2 v24.4s, v2.8h, #0 +; CHECK-NEXT: ushll v21.2d, v20.2s, #3 +; CHECK-NEXT: ushll v22.2d, v19.2s, #0 +; CHECK-NEXT: ushll v25.2d, v23.2s, #3 +; CHECK-NEXT: ushll v26.2d, v24.2s, #0 +; CHECK-NEXT: sub v21.2d, v22.2d, v21.2d +; CHECK-NEXT: sub v22.2d, v26.2d, v25.2d +; CHECK-NEXT: ushll2 v25.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v26.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v7.4s, v7.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v6.8h, #0 +; CHECK-NEXT: ushll v27.2d, v25.2s, #3 +; CHECK-NEXT: ushll v28.2d, v26.2s, #0 +; CHECK-NEXT: ushll v29.2d, v7.2s, #3 +; CHECK-NEXT: ushll v30.2d, v6.2s, #0 +; CHECK-NEXT: ushll2 v7.2d, v7.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v6.4s, #0 +; CHECK-NEXT: sub v27.2d, v28.2d, v27.2d +; CHECK-NEXT: sub v28.2d, v30.2d, v29.2d +; CHECK-NEXT: sub v6.2d, v6.2d, v7.2d +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: stp q28, q6, [x8, #224] +; CHECK-NEXT: ushll2 v25.2d, v25.4s, #3 +; CHECK-NEXT: ushll2 v26.2d, v26.4s, #0 +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ushll v6.2d, v1.2s, #3 +; CHECK-NEXT: ushll v28.2d, 
v0.2s, #0 +; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 +; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: sub v25.2d, v26.2d, v25.2d +; CHECK-NEXT: ushll v26.2d, v3.2s, #3 +; CHECK-NEXT: ushll v7.2d, v2.2s, #0 +; CHECK-NEXT: stp q27, q25, [x8, #160] +; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-NEXT: sub v1.2d, v28.2d, v6.2d +; CHECK-NEXT: ushll2 v23.2d, v23.4s, #3 +; CHECK-NEXT: ushll2 v24.2d, v24.4s, #0 +; CHECK-NEXT: stp q1, q0, [x8, #128] +; CHECK-NEXT: ushll v5.4s, v5.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: sub v2.2d, v2.2d, v3.2d +; CHECK-NEXT: sub v0.2d, v7.2d, v26.2d +; CHECK-NEXT: sub v23.2d, v24.2d, v23.2d +; CHECK-NEXT: ushll v24.2d, v5.2s, #3 +; CHECK-NEXT: stp q0, q2, [x8, #64] +; CHECK-NEXT: ushll2 v16.2d, v16.4s, #3 +; CHECK-NEXT: stp q22, q23, [x8, #96] +; CHECK-NEXT: ushll2 v20.2d, v20.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v19.4s, #0 +; CHECK-NEXT: ushll2 v17.2d, v17.4s, #0 +; CHECK-NEXT: ushll2 v1.2d, v5.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v4.4s, #0 +; CHECK-NEXT: ushll v0.2d, v4.2s, #0 +; CHECK-NEXT: sub v19.2d, v19.2d, v20.2d +; CHECK-NEXT: sub v16.2d, v17.2d, v16.2d ; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d -; CHECK-NEXT: stp q1, q0, [x8] -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d14, [sp], #64 // 8-byte Folded Reload +; CHECK-NEXT: stp q21, q19, [x8, #32] +; CHECK-NEXT: sub v0.2d, v0.2d, v24.2d +; CHECK-NEXT: stp q18, q16, [x8, #192] +; CHECK-NEXT: stp q0, q1, [x8] ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -9119,90 +8342,30 @@ define <8 x i64> @dblext_bv4_v2i8_v8i32_v8i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dblext_bv4_v2i8_v8i32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: add x9, x0, #4 -; CHECK-NEXT: add 
x10, x0, #6 -; CHECK-NEXT: ld1 { v20.b }[0], [x0] -; CHECK-NEXT: add x11, x3, #3 -; CHECK-NEXT: ld1 { v4.b }[0], [x8] -; CHECK-NEXT: add x8, x0, #1 -; CHECK-NEXT: ld1 { v16.b }[0], [x9] -; CHECK-NEXT: add x9, x0, #3 -; CHECK-NEXT: ld1 { v0.b }[0], [x10] -; CHECK-NEXT: add x10, x0, #5 -; CHECK-NEXT: ld1 { v20.b }[4], [x8] -; CHECK-NEXT: add x8, x0, #7 -; CHECK-NEXT: ld1 { v4.b }[4], [x9] -; CHECK-NEXT: add x9, x1, #2 -; CHECK-NEXT: ld1 { v16.b }[4], [x10] -; CHECK-NEXT: add x10, x1, #4 -; CHECK-NEXT: ld1 { v21.b }[0], [x1] -; CHECK-NEXT: ld1 { v5.b }[0], [x9] -; CHECK-NEXT: add x9, x1, #6 -; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: ld1 { v17.b }[0], [x10] -; CHECK-NEXT: add x10, x1, #3 -; CHECK-NEXT: ld1 { v1.b }[0], [x9] -; CHECK-NEXT: add x9, x1, #7 -; CHECK-NEXT: ld1 { v21.b }[4], [x8] -; CHECK-NEXT: add x8, x1, #5 -; CHECK-NEXT: ld1 { v5.b }[4], [x10] -; CHECK-NEXT: add x10, x2, #4 -; CHECK-NEXT: ld1 { v22.b }[0], [x2] -; CHECK-NEXT: ld1 { v17.b }[4], [x8] -; CHECK-NEXT: add x8, x2, #2 -; CHECK-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-NEXT: add x9, x2, #1 -; CHECK-NEXT: ld1 { v18.b }[0], [x10] -; CHECK-NEXT: add x10, x2, #3 -; CHECK-NEXT: ld1 { v6.b }[0], [x8] -; CHECK-NEXT: add x8, x2, #6 -; CHECK-NEXT: ld1 { v22.b }[4], [x9] -; CHECK-NEXT: add x9, x2, #5 -; CHECK-NEXT: ld1 { v23.b }[0], [x3] -; CHECK-NEXT: ld1 { v2.b }[0], [x8] -; CHECK-NEXT: add x8, x2, #7 -; CHECK-NEXT: ld1 { v18.b }[4], [x9] -; CHECK-NEXT: add x9, x3, #2 -; CHECK-NEXT: ld1 { v6.b }[4], [x10] -; CHECK-NEXT: add x10, x3, #4 -; CHECK-NEXT: ld1 { v7.b }[0], [x9] -; CHECK-NEXT: add x9, x3, #6 -; CHECK-NEXT: ld1 { v2.b }[4], [x8] -; CHECK-NEXT: add x8, x3, #1 -; CHECK-NEXT: ld1 { v19.b }[0], [x10] -; CHECK-NEXT: add x10, x3, #7 -; CHECK-NEXT: ld1 { v3.b }[0], [x9] -; CHECK-NEXT: adrp x9, .LCPI154_0 -; CHECK-NEXT: ld1 { v23.b }[4], [x8] -; CHECK-NEXT: add x8, x3, #5 -; CHECK-NEXT: ld1 { v7.b }[4], [x11] -; CHECK-NEXT: ldr d24, [x9, :lo12:.LCPI154_0] -; CHECK-NEXT: 
ld1 { v19.b }[4], [x8] -; CHECK-NEXT: ld1 { v3.b }[4], [x10] -; CHECK-NEXT: tbl v20.8b, { v20.16b, v21.16b, v22.16b, v23.16b }, v24.8b -; CHECK-NEXT: tbl v4.8b, { v4.16b, v5.16b, v6.16b, v7.16b }, v24.8b -; CHECK-NEXT: tbl v16.8b, { v16.16b, v17.16b, v18.16b, v19.16b }, v24.8b -; CHECK-NEXT: tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v24.8b -; CHECK-NEXT: uaddl v1.8h, v20.8b, v16.8b -; CHECK-NEXT: uaddl v0.8h, v4.8b, v0.8b -; CHECK-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v3.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-NEXT: ushll v5.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v6.2d, v3.2s, #3 -; CHECK-NEXT: ushll v7.2d, v0.2s, #3 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #3 -; CHECK-NEXT: ushll2 v16.2d, v3.4s, #3 -; CHECK-NEXT: sub v3.2d, v1.2d, v0.2d -; CHECK-NEXT: sub v1.2d, v2.2d, v16.2d -; CHECK-NEXT: sub v0.2d, v4.2d, v6.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v7.2d +; CHECK-NEXT: ldp s0, s3, [x2] +; CHECK-NEXT: ldp s1, s2, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v3.s }[1], [x3] +; CHECK-NEXT: ld1 { v1.s }[1], [x1], #4 +; CHECK-NEXT: uaddl v0.8h, v0.8b, v3.8b +; CHECK-NEXT: ld1 { v2.s }[1], [x1] +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uaddl v1.8h, v1.8b, v2.8b +; CHECK-NEXT: ushll2 v2.2d, v3.4s, #3 +; CHECK-NEXT: ushll v3.2d, v3.2s, #0 +; CHECK-NEXT: ushll2 v5.4s, v1.8h, #0 +; CHECK-NEXT: sub v3.2d, v3.2d, v2.2d +; CHECK-NEXT: ushll2 v2.2d, v5.4s, #3 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: ushll v6.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v4.2d, v0.4s, #3 +; CHECK-NEXT: sub v1.2d, v5.2d, v2.2d +; CHECK-NEXT: ushll2 v2.2d, v6.4s, #3 +; CHECK-NEXT: ushll v5.2d, v6.2s, #0 +; CHECK-NEXT: ushll v6.2d, v0.2s, #0 +; CHECK-NEXT: sub v0.2d, v5.2d, v2.2d +; CHECK-NEXT: sub v2.2d, v6.2d, v4.2d ; CHECK-NEXT: ret %lp1 = 
load <2 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 2 @@ -9270,58 +8433,46 @@ define <16 x i64> @dblext_bv4_v4i8_v16i32_v16i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dblext_bv4_v4i8_v16i32_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s3, [x2] -; CHECK-NEXT: ldp s1, s2, [x0] -; CHECK-NEXT: ldp s4, s5, [x0, #8] -; CHECK-NEXT: ldp s6, s7, [x2, #8] -; CHECK-NEXT: add x8, x1, #8 -; CHECK-NEXT: add x9, x3, #8 -; CHECK-NEXT: add x10, x1, #12 -; CHECK-NEXT: ld1 { v4.s }[1], [x8] -; CHECK-NEXT: add x8, x3, #12 -; CHECK-NEXT: ld1 { v1.s }[1], [x1], #4 -; CHECK-NEXT: ld1 { v6.s }[1], [x9] -; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 -; CHECK-NEXT: ld1 { v7.s }[1], [x8] -; CHECK-NEXT: ld1 { v2.s }[1], [x1] -; CHECK-NEXT: ld1 { v5.s }[1], [x10] -; CHECK-NEXT: ld1 { v3.s }[1], [x3] -; CHECK-NEXT: uaddl v0.8h, v0.8b, v6.8b -; CHECK-NEXT: uaddl v1.8h, v1.8b, v4.8b -; CHECK-NEXT: uaddl v2.8h, v2.8b, v5.8b -; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll2 v7.4s, v2.8h, #0 -; CHECK-NEXT: ushll2 v19.4s, v3.8h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v16.2d, v4.2s, #0 -; CHECK-NEXT: ushll v17.2d, v5.2s, #0 -; CHECK-NEXT: ushll v6.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 -; CHECK-NEXT: ushll v18.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v20.2d, v3.2s, #3 -; CHECK-NEXT: ushll v21.2d, v2.2s, #3 -; CHECK-NEXT: ushll v22.2d, v19.2s, #3 -; CHECK-NEXT: ushll2 v23.2d, v3.4s, #3 -; CHECK-NEXT: ushll v24.2d, v7.2s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v3.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v7.2d, v19.4s, #3 -; CHECK-NEXT: sub v3.2d, v1.2d, v3.2d -; CHECK-NEXT: sub v7.2d, v0.2d, v7.2d -; CHECK-NEXT: sub v1.2d, 
v5.2d, v2.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v24.2d -; CHECK-NEXT: sub v5.2d, v4.2d, v23.2d -; CHECK-NEXT: sub v6.2d, v6.2d, v22.2d -; CHECK-NEXT: sub v0.2d, v17.2d, v21.2d -; CHECK-NEXT: sub v4.2d, v16.2d, v20.2d +; CHECK-NEXT: ldp d0, d3, [x3] +; CHECK-NEXT: ldp d1, d2, [x0] +; CHECK-NEXT: ldp d4, d5, [x2] +; CHECK-NEXT: uaddl v0.8h, v0.8b, v3.8b +; CHECK-NEXT: ldp d6, d3, [x1] +; CHECK-NEXT: uaddl v1.8h, v1.8b, v2.8b +; CHECK-NEXT: ushll2 v20.4s, v0.8h, #0 +; CHECK-NEXT: uaddl v2.8h, v4.8b, v5.8b +; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-NEXT: uaddl v3.8h, v6.8b, v3.8b +; CHECK-NEXT: ushll v6.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v16.4s, v3.8h, #0 +; CHECK-NEXT: ushll v17.4s, v3.4h, #0 +; CHECK-NEXT: ushll2 v3.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v5.2d, v16.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v17.4s, #0 +; CHECK-NEXT: ushll2 v18.4s, v2.8h, #0 +; CHECK-NEXT: ushll v19.4s, v2.4h, #0 +; CHECK-NEXT: ushll v21.4s, v0.4h, #0 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll2 v2.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v5.2d, v19.4s, #0 +; CHECK-NEXT: ushll2 v0.2d, v20.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v21.4s, #0 +; CHECK-NEXT: sub v5.2d, v5.2d, v2.2d +; CHECK-NEXT: sub v7.2d, v7.2d, v0.2d +; CHECK-NEXT: ushll v0.2d, v4.2s, #3 +; CHECK-NEXT: ushll v2.2d, v6.2s, #0 +; CHECK-NEXT: ushll v4.2d, v16.2s, #3 +; CHECK-NEXT: ushll v6.2d, v17.2s, #0 +; CHECK-NEXT: sub v0.2d, v2.2d, v0.2d +; CHECK-NEXT: sub v2.2d, v6.2d, v4.2d +; CHECK-NEXT: ushll v4.2d, v18.2s, #3 +; CHECK-NEXT: ushll v6.2d, v19.2s, #0 +; CHECK-NEXT: ushll v16.2d, v20.2s, #3 +; CHECK-NEXT: ushll v17.2d, v21.2s, #0 +; CHECK-NEXT: sub v4.2d, v6.2d, v4.2d +; CHECK-NEXT: sub v6.2d, v17.2d, v16.2d ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -9389,110 +8540,90 @@ define <32 x i64> @dblext_bv4_v8i8_v32i32_v32i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: 
dblext_bv4_v8i8_v32i32_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str d14, [sp, #-64]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_offset b8, -8 -; CHECK-NEXT: .cfi_offset b9, -16 -; CHECK-NEXT: .cfi_offset b10, -24 -; CHECK-NEXT: .cfi_offset b11, -32 -; CHECK-NEXT: .cfi_offset b12, -40 -; CHECK-NEXT: .cfi_offset b13, -48 -; CHECK-NEXT: .cfi_offset b14, -64 -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ldp d3, d4, [x0, #16] -; CHECK-NEXT: ldp d20, d21, [x3] -; CHECK-NEXT: uaddl v22.8h, v1.8b, v3.8b -; CHECK-NEXT: ldp d1, d3, [x3, #16] -; CHECK-NEXT: uaddl v28.8h, v2.8b, v4.8b -; CHECK-NEXT: ldp d0, d17, [x2] -; CHECK-NEXT: ldp d5, d6, [x1] -; CHECK-NEXT: uaddl v25.8h, v21.8b, v3.8b -; CHECK-NEXT: ldp d7, d16, [x1, #16] -; CHECK-NEXT: ldp d18, d19, [x2, #16] -; CHECK-NEXT: ushll v31.4s, v25.4h, #0 -; CHECK-NEXT: uaddl v7.8h, v5.8b, v7.8b -; CHECK-NEXT: ushll2 v25.4s, v25.8h, #0 -; CHECK-NEXT: uaddl v5.8h, v0.8b, v18.8b -; CHECK-NEXT: uaddl v18.8h, v20.8b, v1.8b -; CHECK-NEXT: uaddl v26.8h, v17.8b, v19.8b -; CHECK-NEXT: uaddl v27.8h, v6.8b, v16.8b -; CHECK-NEXT: ushll2 v2.4s, v18.8h, #0 -; CHECK-NEXT: ushll2 v6.4s, v5.8h, #0 -; CHECK-NEXT: ushll2 v3.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v16.4s, v7.8h, #0 -; CHECK-NEXT: ushll2 v20.4s, v22.8h, #0 -; CHECK-NEXT: ushll v17.2d, v2.2s, #0 -; CHECK-NEXT: ushll v29.4s, v7.4h, #0 -; CHECK-NEXT: ushll v30.4s, v26.4h, #0 -; CHECK-NEXT: ushll2 v26.4s, v26.8h, #0 -; CHECK-NEXT: ushll2 v8.2d, v25.4s, #3 -; CHECK-NEXT: ushll v25.2d, v25.2s, #3 -; CHECK-NEXT: ushll2 v4.2d, v6.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v16.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v20.4s, #0 -; CHECK-NEXT: ushll v21.4s, v18.4h, #0 -; CHECK-NEXT: ushll v24.4s, v5.4h, #0 -; CHECK-NEXT: ushll v18.2d, v6.2s, #0 -; CHECK-NEXT: ushll v19.2d, v16.2s, 
#0 -; CHECK-NEXT: ushll2 v6.2d, v29.4s, #0 -; CHECK-NEXT: ushll v16.4s, v22.4h, #0 -; CHECK-NEXT: ushll v22.2d, v20.2s, #0 -; CHECK-NEXT: ushll v20.2d, v29.2s, #0 -; CHECK-NEXT: ushll v29.4s, v27.4h, #0 -; CHECK-NEXT: ushll2 v27.4s, v27.8h, #0 -; CHECK-NEXT: ushll2 v9.2d, v26.4s, #3 -; CHECK-NEXT: ushll v26.2d, v26.2s, #3 -; CHECK-NEXT: sub v3.2d, v3.2d, v8.2d -; CHECK-NEXT: sub v17.2d, v17.2d, v25.2d -; CHECK-NEXT: ushll2 v2.2d, v21.4s, #0 -; CHECK-NEXT: ushll2 v5.2d, v24.4s, #0 -; CHECK-NEXT: stp q17, q3, [x8, #224] -; CHECK-NEXT: ushll v23.2d, v21.2s, #0 -; CHECK-NEXT: ushll v21.2d, v24.2s, #0 -; CHECK-NEXT: ushll v24.4s, v28.4h, #0 -; CHECK-NEXT: ushll2 v28.4s, v28.8h, #0 -; CHECK-NEXT: ushll2 v10.2d, v27.4s, #3 -; CHECK-NEXT: ushll v27.2d, v27.2s, #3 -; CHECK-NEXT: sub v4.2d, v4.2d, v9.2d -; CHECK-NEXT: sub v3.2d, v18.2d, v26.2d -; CHECK-NEXT: ushll2 v11.2d, v28.4s, #3 -; CHECK-NEXT: ushll v28.2d, v28.2s, #3 -; CHECK-NEXT: stp q3, q4, [x8, #160] -; CHECK-NEXT: sub v1.2d, v1.2d, v10.2d -; CHECK-NEXT: sub v4.2d, v19.2d, v27.2d -; CHECK-NEXT: ushll2 v12.2d, v31.4s, #3 -; CHECK-NEXT: ushll v31.2d, v31.2s, #3 -; CHECK-NEXT: stp q4, q1, [x8, #96] -; CHECK-NEXT: sub v0.2d, v0.2d, v11.2d -; CHECK-NEXT: sub v1.2d, v22.2d, v28.2d -; CHECK-NEXT: ushll2 v13.2d, v30.4s, #3 -; CHECK-NEXT: ushll v30.2d, v30.2s, #3 -; CHECK-NEXT: stp q1, q0, [x8, #32] -; CHECK-NEXT: sub v2.2d, v2.2d, v12.2d -; CHECK-NEXT: sub v0.2d, v23.2d, v31.2d -; CHECK-NEXT: ushll2 v14.2d, v29.4s, #3 -; CHECK-NEXT: ushll v29.2d, v29.2s, #3 -; CHECK-NEXT: stp q0, q2, [x8, #192] -; CHECK-NEXT: sub v1.2d, v5.2d, v13.2d -; CHECK-NEXT: sub v2.2d, v21.2d, v30.2d -; CHECK-NEXT: sub v0.2d, v6.2d, v14.2d -; CHECK-NEXT: stp q2, q1, [x8, #128] -; CHECK-NEXT: sub v1.2d, v20.2d, v29.2d -; CHECK-NEXT: ushll2 v7.2d, v16.4s, #0 -; CHECK-NEXT: ushll v2.2d, v16.2s, #0 -; CHECK-NEXT: stp q1, q0, [x8, #64] -; CHECK-NEXT: ushll2 v0.2d, v24.4s, #3 -; CHECK-NEXT: ushll v1.2d, v24.2s, #3 -; CHECK-NEXT: sub v0.2d, v7.2d, v0.2d +; 
CHECK-NEXT: ldp q0, q1, [x3] +; CHECK-NEXT: ldp q7, q16, [x0] +; CHECK-NEXT: uaddl v4.8h, v0.8b, v1.8b +; CHECK-NEXT: uaddl2 v6.8h, v0.16b, v1.16b +; CHECK-NEXT: ldp q2, q3, [x2] +; CHECK-NEXT: ldp q5, q17, [x1] +; CHECK-NEXT: uaddl v0.8h, v2.8b, v3.8b +; CHECK-NEXT: uaddl2 v1.8h, v2.16b, v3.16b +; CHECK-NEXT: uaddl v2.8h, v5.8b, v17.8b +; CHECK-NEXT: uaddl2 v3.8h, v5.16b, v17.16b +; CHECK-NEXT: uaddl v5.8h, v7.8b, v16.8b +; CHECK-NEXT: uaddl2 v7.8h, v7.16b, v16.16b +; CHECK-NEXT: ushll v16.4s, v6.4h, #0 +; CHECK-NEXT: ushll v17.4s, v4.4h, #0 +; CHECK-NEXT: ushll v18.2d, v16.2s, #3 +; CHECK-NEXT: ushll v19.2d, v17.2s, #0 +; CHECK-NEXT: ushll2 v20.4s, v7.8h, #0 +; CHECK-NEXT: sub v18.2d, v19.2d, v18.2d +; CHECK-NEXT: ushll2 v19.4s, v5.8h, #0 +; CHECK-NEXT: ushll2 v23.4s, v3.8h, #0 +; CHECK-NEXT: ushll2 v24.4s, v2.8h, #0 +; CHECK-NEXT: ushll v21.2d, v20.2s, #3 +; CHECK-NEXT: ushll v22.2d, v19.2s, #0 +; CHECK-NEXT: ushll v25.2d, v23.2s, #3 +; CHECK-NEXT: ushll v26.2d, v24.2s, #0 +; CHECK-NEXT: sub v21.2d, v22.2d, v21.2d +; CHECK-NEXT: sub v22.2d, v26.2d, v25.2d +; CHECK-NEXT: ushll2 v25.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v26.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v6.8h, #0 +; CHECK-NEXT: ushll2 v4.4s, v4.8h, #0 +; CHECK-NEXT: ushll v27.2d, v25.2s, #3 +; CHECK-NEXT: ushll v28.2d, v26.2s, #0 +; CHECK-NEXT: ushll v29.2d, v6.2s, #3 +; CHECK-NEXT: ushll v30.2d, v4.2s, #0 +; CHECK-NEXT: ushll2 v6.2d, v6.4s, #3 +; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 +; CHECK-NEXT: sub v27.2d, v28.2d, v27.2d +; CHECK-NEXT: sub v28.2d, v30.2d, v29.2d +; CHECK-NEXT: sub v4.2d, v4.2d, v6.2d +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: stp q28, q4, [x8, #224] +; CHECK-NEXT: ushll2 v25.2d, v25.4s, #3 +; CHECK-NEXT: ushll2 v26.2d, v26.4s, #0 +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ushll v4.2d, v1.2s, #3 +; CHECK-NEXT: ushll v28.2d, v0.2s, #0 +; CHECK-NEXT: ushll2 v1.2d, v1.4s, #3 +; CHECK-NEXT: 
ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: sub v25.2d, v26.2d, v25.2d +; CHECK-NEXT: ushll v26.2d, v3.2s, #3 +; CHECK-NEXT: ushll v6.2d, v2.2s, #0 +; CHECK-NEXT: stp q27, q25, [x8, #160] +; CHECK-NEXT: ushll2 v3.2d, v3.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v2.4s, #0 +; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-NEXT: sub v1.2d, v28.2d, v4.2d +; CHECK-NEXT: ushll2 v23.2d, v23.4s, #3 +; CHECK-NEXT: ushll2 v24.2d, v24.4s, #0 +; CHECK-NEXT: stp q1, q0, [x8, #128] +; CHECK-NEXT: ushll v7.4s, v7.4h, #0 +; CHECK-NEXT: ushll v5.4s, v5.4h, #0 +; CHECK-NEXT: sub v2.2d, v2.2d, v3.2d +; CHECK-NEXT: sub v0.2d, v6.2d, v26.2d +; CHECK-NEXT: sub v23.2d, v24.2d, v23.2d +; CHECK-NEXT: ushll v24.2d, v7.2s, #3 +; CHECK-NEXT: stp q0, q2, [x8, #64] +; CHECK-NEXT: ushll2 v16.2d, v16.4s, #3 +; CHECK-NEXT: stp q22, q23, [x8, #96] +; CHECK-NEXT: ushll2 v20.2d, v20.4s, #3 +; CHECK-NEXT: ushll2 v19.2d, v19.4s, #0 +; CHECK-NEXT: ushll2 v17.2d, v17.4s, #0 +; CHECK-NEXT: ushll2 v1.2d, v7.4s, #3 +; CHECK-NEXT: ushll2 v2.2d, v5.4s, #0 +; CHECK-NEXT: ushll v0.2d, v5.2s, #0 +; CHECK-NEXT: sub v19.2d, v19.2d, v20.2d +; CHECK-NEXT: sub v16.2d, v17.2d, v16.2d ; CHECK-NEXT: sub v1.2d, v2.2d, v1.2d -; CHECK-NEXT: stp q1, q0, [x8] -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d14, [sp], #64 // 8-byte Folded Reload +; CHECK-NEXT: stp q21, q19, [x8, #32] +; CHECK-NEXT: sub v0.2d, v0.2d, v24.2d +; CHECK-NEXT: stp q18, q16, [x8, #192] +; CHECK-NEXT: stp q0, q1, [x8] ; CHECK-NEXT: ret %lp1 = load <8 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -9560,90 +8691,26 @@ define <8 x i64> @dblext_bv4_v2i16_v8i32_v8i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dblext_bv4_v2i16_v8i32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #4 -; CHECK-NEXT: add x9, x0, #8 -; CHECK-NEXT: ld1 { v3.h }[0], [x0] -; CHECK-NEXT: add x10, x0, #12 -; 
CHECK-NEXT: ld1 { v6.h }[0], [x1] -; CHECK-NEXT: add x11, x1, #14 -; CHECK-NEXT: ld1 { v0.h }[0], [x8] -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: ld1 { v2.h }[0], [x9] -; CHECK-NEXT: add x9, x0, #6 -; CHECK-NEXT: ld1 { v1.h }[0], [x10] -; CHECK-NEXT: add x10, x0, #10 -; CHECK-NEXT: ld1 { v3.h }[2], [x8] -; CHECK-NEXT: add x8, x1, #4 -; CHECK-NEXT: ld1 { v0.h }[2], [x9] -; CHECK-NEXT: add x9, x1, #8 -; CHECK-NEXT: ld1 { v2.h }[2], [x10] -; CHECK-NEXT: add x10, x0, #14 -; CHECK-NEXT: ld1 { v4.h }[0], [x8] -; CHECK-NEXT: add x8, x1, #2 -; CHECK-NEXT: ld1 { v5.h }[0], [x9] -; CHECK-NEXT: add x9, x1, #6 -; CHECK-NEXT: ld1 { v1.h }[2], [x10] -; CHECK-NEXT: add x10, x1, #10 -; CHECK-NEXT: ld1 { v6.h }[2], [x8] -; CHECK-NEXT: add x8, x1, #12 -; CHECK-NEXT: ld1 { v4.h }[2], [x9] -; CHECK-NEXT: add x9, x2, #4 -; CHECK-NEXT: ld1 { v5.h }[2], [x10] -; CHECK-NEXT: add x10, x2, #12 -; CHECK-NEXT: ld1 { v7.h }[0], [x8] -; CHECK-NEXT: add x8, x2, #8 -; CHECK-NEXT: ld1 { v16.h }[0], [x2] -; CHECK-NEXT: uzp1 v3.4h, v3.4h, v6.4h -; CHECK-NEXT: ld1 { v17.h }[0], [x9] -; CHECK-NEXT: add x9, x2, #2 -; CHECK-NEXT: ld1 { v18.h }[0], [x8] -; CHECK-NEXT: add x8, x2, #6 -; CHECK-NEXT: ld1 { v19.h }[0], [x10] -; CHECK-NEXT: add x10, x2, #10 -; CHECK-NEXT: ld1 { v16.h }[2], [x9] -; CHECK-NEXT: add x9, x2, #14 -; CHECK-NEXT: ld1 { v17.h }[2], [x8] -; CHECK-NEXT: add x8, x3, #4 -; CHECK-NEXT: ld1 { v18.h }[2], [x10] -; CHECK-NEXT: add x10, x3, #12 -; CHECK-NEXT: ld1 { v19.h }[2], [x9] -; CHECK-NEXT: add x9, x3, #8 -; CHECK-NEXT: ld1 { v20.h }[0], [x3] -; CHECK-NEXT: uzp1 v2.4h, v2.4h, v5.4h -; CHECK-NEXT: ld1 { v21.h }[0], [x8] -; CHECK-NEXT: add x8, x3, #2 -; CHECK-NEXT: ld1 { v22.h }[0], [x9] -; CHECK-NEXT: add x9, x3, #6 -; CHECK-NEXT: ld1 { v23.h }[0], [x10] -; CHECK-NEXT: add x10, x3, #10 -; CHECK-NEXT: ld1 { v20.h }[2], [x8] -; CHECK-NEXT: add x8, x3, #14 -; CHECK-NEXT: ld1 { v21.h }[2], [x9] -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v4.4h -; CHECK-NEXT: ld1 { v22.h }[2], [x10] -; CHECK-NEXT: 
ld1 { v23.h }[2], [x8] -; CHECK-NEXT: ld1 { v7.h }[2], [x11] -; CHECK-NEXT: uzp1 v5.4h, v16.4h, v20.4h -; CHECK-NEXT: uzp1 v16.4h, v17.4h, v21.4h -; CHECK-NEXT: uzp1 v6.4h, v18.4h, v22.4h -; CHECK-NEXT: uzp1 v17.4h, v19.4h, v23.4h -; CHECK-NEXT: uzp1 v1.4h, v1.4h, v7.4h -; CHECK-NEXT: uaddl v2.4s, v3.4h, v2.4h -; CHECK-NEXT: uaddl v3.4s, v5.4h, v6.4h -; CHECK-NEXT: uaddl v4.4s, v16.4h, v17.4h -; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h -; CHECK-NEXT: ushll v5.2d, v3.2s, #0 -; CHECK-NEXT: ushll v6.2d, v2.2s, #0 -; CHECK-NEXT: ushll2 v3.2d, v3.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v2.4s, #0 -; CHECK-NEXT: ushll v2.2d, v4.2s, #3 -; CHECK-NEXT: ushll v7.2d, v0.2s, #3 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #3 -; CHECK-NEXT: ushll2 v4.2d, v4.4s, #3 -; CHECK-NEXT: sub v1.2d, v1.2d, v0.2d +; CHECK-NEXT: ldp d0, d3, [x3] +; CHECK-NEXT: ldp d1, d2, [x0] +; CHECK-NEXT: ldp d4, d5, [x2] +; CHECK-NEXT: uaddl v3.4s, v0.4h, v3.4h +; CHECK-NEXT: ldp d6, d0, [x1] +; CHECK-NEXT: uaddl v1.4s, v1.4h, v2.4h +; CHECK-NEXT: uaddl v2.4s, v4.4h, v5.4h +; CHECK-NEXT: ushll2 v4.2d, v1.4s, #3 +; CHECK-NEXT: uaddl v0.4s, v6.4h, v0.4h +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: ushll2 v7.2d, v2.4s, #3 +; CHECK-NEXT: ushll2 v5.2d, v0.4s, #3 +; CHECK-NEXT: ushll v6.2d, v0.2s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-NEXT: sub v0.2d, v1.2d, v4.2d +; CHECK-NEXT: ushll2 v4.2d, v3.4s, #3 +; CHECK-NEXT: ushll v3.2d, v3.2s, #0 +; CHECK-NEXT: sub v1.2d, v6.2d, v5.2d +; CHECK-NEXT: sub v2.2d, v2.2d, v7.2d ; CHECK-NEXT: sub v3.2d, v3.2d, v4.2d -; CHECK-NEXT: sub v0.2d, v6.2d, v7.2d -; CHECK-NEXT: sub v2.2d, v5.2d, v2.2d ; CHECK-NEXT: ret %lp1 = load <2 x i16>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -9711,46 +8778,42 @@ define <16 x i64> @dblext_bv4_v4i16_v16i32_v16i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dblext_bv4_v4i16_v16i32_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d1, d2, [x0] -; CHECK-NEXT: ldp d3, d4, [x0, #16] -; CHECK-NEXT: ldp d5, d6, [x1] -; CHECK-NEXT: ldp 
d7, d16, [x1, #16] -; CHECK-NEXT: uaddl v1.4s, v1.4h, v3.4h -; CHECK-NEXT: ldp d0, d17, [x2] -; CHECK-NEXT: uaddl v2.4s, v2.4h, v4.4h -; CHECK-NEXT: ldp d18, d19, [x2, #16] -; CHECK-NEXT: uaddl v5.4s, v5.4h, v7.4h -; CHECK-NEXT: ldp d20, d21, [x3] -; CHECK-NEXT: uaddl v6.4s, v6.4h, v16.4h -; CHECK-NEXT: ldp d3, d7, [x3, #16] -; CHECK-NEXT: uaddl v0.4s, v0.4h, v18.4h -; CHECK-NEXT: uaddl v17.4s, v17.4h, v19.4h -; CHECK-NEXT: ushll v18.2d, v5.2s, #0 -; CHECK-NEXT: uaddl v3.4s, v20.4h, v3.4h -; CHECK-NEXT: uaddl v7.4s, v21.4h, v7.4h -; CHECK-NEXT: ushll v4.2d, v0.2s, #0 -; CHECK-NEXT: ushll v16.2d, v3.2s, #0 -; CHECK-NEXT: ushll v19.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v20.2d, v3.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v3.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: ushll v21.2d, v7.2s, #3 -; CHECK-NEXT: ushll v22.2d, v17.2s, #3 -; CHECK-NEXT: ushll v23.2d, v6.2s, #3 -; CHECK-NEXT: ushll v24.2d, v2.2s, #3 -; CHECK-NEXT: ushll2 v7.2d, v7.4s, #3 -; CHECK-NEXT: ushll2 v5.2d, v17.4s, #3 -; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 -; CHECK-NEXT: ushll2 v6.2d, v6.4s, #3 -; CHECK-NEXT: sub v1.2d, v1.2d, v2.2d -; CHECK-NEXT: sub v3.2d, v3.2d, v6.2d -; CHECK-NEXT: sub v5.2d, v0.2d, v5.2d -; CHECK-NEXT: sub v7.2d, v20.2d, v7.2d -; CHECK-NEXT: sub v0.2d, v19.2d, v24.2d -; CHECK-NEXT: sub v2.2d, v18.2d, v23.2d -; CHECK-NEXT: sub v4.2d, v4.2d, v22.2d -; CHECK-NEXT: sub v6.2d, v16.2d, v21.2d +; CHECK-NEXT: ldp q0, q3, [x3] +; CHECK-NEXT: ldp q1, q2, [x1] +; CHECK-NEXT: uaddl v6.4s, v0.4h, v3.4h +; CHECK-NEXT: uaddl2 v16.4s, v0.8h, v3.8h +; CHECK-NEXT: ushll2 v21.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v20.2d, v16.4s, #3 +; CHECK-NEXT: ldp q0, q3, [x0] +; CHECK-NEXT: uaddl v18.4s, v1.4h, v2.4h +; CHECK-NEXT: uaddl2 v2.4s, v1.8h, v2.8h +; CHECK-NEXT: ushll v16.2d, v16.2s, #3 +; CHECK-NEXT: ushll2 v7.2d, v18.4s, #0 +; CHECK-NEXT: ushll v18.2d, v18.2s, #0 +; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: uaddl2 v19.4s, v0.8h, v3.8h +; CHECK-NEXT: 
uaddl v0.4s, v0.4h, v3.4h +; CHECK-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-NEXT: ushll2 v1.2d, v19.4s, #3 +; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-NEXT: uaddl v17.4s, v4.4h, v5.4h +; CHECK-NEXT: uaddl2 v4.4s, v4.8h, v5.8h +; CHECK-NEXT: ushll2 v5.2d, v2.4s, #3 +; CHECK-NEXT: sub v1.2d, v3.2d, v1.2d +; CHECK-NEXT: sub v3.2d, v7.2d, v5.2d +; CHECK-NEXT: ushll2 v5.2d, v4.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v17.4s, #0 +; CHECK-NEXT: ushll v19.2d, v19.2s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #3 +; CHECK-NEXT: ushll v4.2d, v4.2s, #3 +; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: sub v5.2d, v7.2d, v5.2d +; CHECK-NEXT: sub v7.2d, v21.2d, v20.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v19.2d +; CHECK-NEXT: sub v2.2d, v18.2d, v2.2d +; CHECK-NEXT: sub v4.2d, v17.2d, v4.2d +; CHECK-NEXT: sub v6.2d, v6.2d, v16.2d ; CHECK-NEXT: ret %lp1 = load <4 x i16>, ptr %p %p2 = getelementptr i8, ptr %p, i32 8 @@ -9818,103 +8881,86 @@ define <32 x i64> @dblext_bv4_v8i16_v32i32_v32i64(ptr %p, ptr %q, ptr %r, ptr %s) { ; CHECK-LABEL: dblext_bv4_v8i16_v32i32_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str d14, [sp, #-64]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_offset b8, -8 -; CHECK-NEXT: .cfi_offset b9, -16 -; CHECK-NEXT: .cfi_offset b10, -24 -; CHECK-NEXT: .cfi_offset b11, -32 -; CHECK-NEXT: .cfi_offset b12, -40 -; CHECK-NEXT: .cfi_offset b13, -48 -; CHECK-NEXT: .cfi_offset b14, -64 -; CHECK-NEXT: ldp q6, q7, [x1] -; CHECK-NEXT: ldp q16, q17, [x1, #32] -; CHECK-NEXT: uaddl v23.4s, v6.4h, v16.4h -; CHECK-NEXT: uaddl2 v16.4s, v6.8h, v16.8h -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: uaddl2 v29.4s, v7.8h, v17.8h -; CHECK-NEXT: ushll2 v6.2d, v16.4s, #0 -; CHECK-NEXT: ushll v16.2d, v16.2s, #0 -; CHECK-NEXT: ushll2 v10.2d, v29.4s, #3 -; CHECK-NEXT: ushll v29.2d, v29.2s, #3 -; CHECK-NEXT: ldp q4, q5, [x0, #32] -; CHECK-NEXT: sub v6.2d, v6.2d, v10.2d -; CHECK-NEXT: sub v16.2d, v16.2d, v29.2d -; CHECK-NEXT: uaddl v17.4s, v7.4h, v17.4h -; CHECK-NEXT: uaddl2 v28.4s, v2.8h, v4.8h -; CHECK-NEXT: uaddl v0.4s, v2.4h, v4.4h -; CHECK-NEXT: ldp q24, q21, [x3, #16] -; CHECK-NEXT: uaddl2 v30.4s, v3.8h, v5.8h -; CHECK-NEXT: ushll2 v2.2d, v28.4s, #0 -; CHECK-NEXT: ushll v28.2d, v28.2s, #0 -; CHECK-NEXT: ushll2 v11.2d, v30.4s, #3 -; CHECK-NEXT: ushll v30.2d, v30.2s, #3 -; CHECK-NEXT: ldp q1, q18, [x2] -; CHECK-NEXT: sub v2.2d, v2.2d, v11.2d -; CHECK-NEXT: ushll2 v7.2d, v23.4s, #0 -; CHECK-NEXT: ushll v23.2d, v23.2s, #0 -; CHECK-NEXT: ushll2 v14.2d, v17.4s, #3 -; CHECK-NEXT: ushll v17.2d, v17.2s, #3 -; CHECK-NEXT: ldp q19, q20, [x2, #32] -; CHECK-NEXT: uaddl v25.4s, v1.4h, v19.4h -; CHECK-NEXT: uaddl2 v19.4s, v1.8h, v19.8h -; CHECK-NEXT: ldr q22, [x3] -; CHECK-NEXT: ldr q26, [x3, #48] -; CHECK-NEXT: stp q16, q6, [x8, #96] -; CHECK-NEXT: sub v6.2d, v28.2d, v30.2d -; CHECK-NEXT: uaddl v27.4s, v22.4h, v21.4h -; CHECK-NEXT: uaddl2 v4.4s, v22.8h, v21.8h -; CHECK-NEXT: stp q6, q2, 
[x8, #32] -; CHECK-NEXT: uaddl2 v21.4s, v24.8h, v26.8h -; CHECK-NEXT: uaddl v24.4s, v24.4h, v26.4h -; CHECK-NEXT: uaddl2 v22.4s, v18.8h, v20.8h -; CHECK-NEXT: uaddl v18.4s, v18.4h, v20.4h -; CHECK-NEXT: ushll2 v20.2d, v4.4s, #0 -; CHECK-NEXT: ushll v31.2d, v4.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v27.4s, #0 -; CHECK-NEXT: ushll v27.2d, v27.2s, #0 -; CHECK-NEXT: ushll2 v12.2d, v24.4s, #3 -; CHECK-NEXT: ushll v24.2d, v24.2s, #3 -; CHECK-NEXT: uaddl v1.4s, v3.4h, v5.4h -; CHECK-NEXT: ushll2 v5.2d, v25.4s, #0 -; CHECK-NEXT: ushll v25.2d, v25.2s, #0 -; CHECK-NEXT: ushll2 v13.2d, v18.4s, #3 -; CHECK-NEXT: ushll v18.2d, v18.2s, #3 -; CHECK-NEXT: sub v4.2d, v4.2d, v12.2d -; CHECK-NEXT: sub v2.2d, v27.2d, v24.2d -; CHECK-NEXT: sub v5.2d, v5.2d, v13.2d -; CHECK-NEXT: stp q2, q4, [x8, #192] -; CHECK-NEXT: sub v4.2d, v25.2d, v18.2d -; CHECK-NEXT: sub v2.2d, v7.2d, v14.2d -; CHECK-NEXT: stp q4, q5, [x8, #128] -; CHECK-NEXT: sub v5.2d, v23.2d, v17.2d -; CHECK-NEXT: ushll2 v26.2d, v19.4s, #0 -; CHECK-NEXT: ushll v19.2d, v19.2s, #0 -; CHECK-NEXT: stp q5, q2, [x8, #64] +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q4, q21, [x3] +; CHECK-NEXT: ldp q5, q25, [x3, #32] +; CHECK-NEXT: uaddl2 v7.4s, v4.8h, v5.8h +; CHECK-NEXT: uaddl v5.4s, v4.4h, v5.4h +; CHECK-NEXT: ldp q30, q31, [x0, #32] +; CHECK-NEXT: uaddl2 v18.4s, v21.8h, v25.8h +; CHECK-NEXT: uaddl v21.4s, v21.4h, v25.4h +; CHECK-NEXT: uaddl2 v16.4s, v1.8h, v30.8h +; CHECK-NEXT: ushll v25.2d, v21.2s, #3 +; CHECK-NEXT: ldp q0, q23, [x2] +; CHECK-NEXT: uaddl2 v19.4s, v2.8h, v31.8h +; CHECK-NEXT: ushll2 v21.2d, v21.4s, #3 +; CHECK-NEXT: uaddl v2.4s, v2.4h, v31.4h +; CHECK-NEXT: ldp q3, q24, [x1] +; CHECK-NEXT: ldp q26, q27, [x2, #32] +; CHECK-NEXT: uaddl2 v20.4s, v0.8h, v26.8h +; CHECK-NEXT: uaddl v4.4s, v0.4h, v26.4h +; CHECK-NEXT: ldp q28, q29, [x1, #32] +; CHECK-NEXT: uaddl2 v22.4s, v23.8h, v27.8h +; CHECK-NEXT: uaddl v23.4s, v23.4h, v27.4h +; CHECK-NEXT: ushll v26.2d, v5.2s, #0 +; CHECK-NEXT: uaddl2 v6.4s, v3.8h, v28.8h +; 
CHECK-NEXT: uaddl v3.4s, v3.4h, v28.4h +; CHECK-NEXT: uaddl2 v17.4s, v24.8h, v29.8h +; CHECK-NEXT: ushll v27.2d, v19.2s, #3 +; CHECK-NEXT: ushll v28.2d, v16.2s, #0 +; CHECK-NEXT: uaddl v24.4s, v24.4h, v29.4h +; CHECK-NEXT: uaddl v0.4s, v1.4h, v30.4h +; CHECK-NEXT: sub v25.2d, v26.2d, v25.2d +; CHECK-NEXT: sub v26.2d, v28.2d, v27.2d +; CHECK-NEXT: ushll v27.2d, v17.2s, #3 +; CHECK-NEXT: ushll v28.2d, v6.2s, #0 +; CHECK-NEXT: ushll v29.2d, v22.2s, #3 +; CHECK-NEXT: ushll v30.2d, v20.2s, #0 +; CHECK-NEXT: sub v27.2d, v28.2d, v27.2d +; CHECK-NEXT: sub v28.2d, v30.2d, v29.2d +; CHECK-NEXT: ushll v29.2d, v18.2s, #3 +; CHECK-NEXT: ushll v30.2d, v7.2s, #0 +; CHECK-NEXT: ushll2 v22.2d, v22.4s, #3 +; CHECK-NEXT: ushll2 v20.2d, v20.4s, #0 +; CHECK-NEXT: ushll2 v18.2d, v18.4s, #3 +; CHECK-NEXT: ushll2 v7.2d, v7.4s, #0 +; CHECK-NEXT: ushll2 v5.2d, v5.4s, #0 +; CHECK-NEXT: sub v20.2d, v20.2d, v22.2d +; CHECK-NEXT: ushll v22.2d, v23.2s, #3 +; CHECK-NEXT: sub v7.2d, v7.2d, v18.2d +; CHECK-NEXT: stp q28, q20, [x8, #160] +; CHECK-NEXT: ushll v18.2d, v4.2s, #0 +; CHECK-NEXT: ushll2 v23.2d, v23.4s, #3 +; CHECK-NEXT: ushll2 v4.2d, v4.4s, #0 +; CHECK-NEXT: sub v5.2d, v5.2d, v21.2d +; CHECK-NEXT: ushll v31.2d, v24.2s, #3 +; CHECK-NEXT: sub v29.2d, v30.2d, v29.2d +; CHECK-NEXT: stp q25, q5, [x8, #192] +; CHECK-NEXT: ushll v30.2d, v3.2s, #0 +; CHECK-NEXT: ushll2 v24.2d, v24.4s, #3 +; CHECK-NEXT: stp q29, q7, [x8, #224] +; CHECK-NEXT: ushll2 v3.2d, v3.4s, #0 +; CHECK-NEXT: sub v4.2d, v4.2d, v23.2d +; CHECK-NEXT: sub v5.2d, v18.2d, v22.2d +; CHECK-NEXT: sub v3.2d, v3.2d, v24.2d +; CHECK-NEXT: stp q5, q4, [x8, #128] +; CHECK-NEXT: sub v4.2d, v30.2d, v31.2d +; CHECK-NEXT: ushll v1.2d, v2.2s, #3 +; CHECK-NEXT: ushll2 v19.2d, v19.4s, #3 +; CHECK-NEXT: stp q4, q3, [x8, #64] +; CHECK-NEXT: ushll2 v17.2d, v17.4s, #3 +; CHECK-NEXT: ushll2 v6.2d, v6.4s, #0 +; CHECK-NEXT: ushll2 v7.2d, v16.4s, #0 +; CHECK-NEXT: ushll2 v2.2d, v2.4s, #3 ; CHECK-NEXT: ushll2 v3.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v8.2d, 
v21.4s, #3 -; CHECK-NEXT: ushll2 v9.2d, v22.4s, #3 -; CHECK-NEXT: ushll v21.2d, v21.2s, #3 -; CHECK-NEXT: ushll v22.2d, v22.2s, #3 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v2.2d, v1.4s, #3 -; CHECK-NEXT: ushll v1.2d, v1.2s, #3 -; CHECK-NEXT: sub v20.2d, v20.2d, v8.2d -; CHECK-NEXT: sub v21.2d, v31.2d, v21.2d -; CHECK-NEXT: sub v26.2d, v26.2d, v9.2d -; CHECK-NEXT: sub v19.2d, v19.2d, v22.2d -; CHECK-NEXT: stp q21, q20, [x8, #224] +; CHECK-NEXT: sub v6.2d, v6.2d, v17.2d +; CHECK-NEXT: sub v7.2d, v7.2d, v19.2d ; CHECK-NEXT: sub v2.2d, v3.2d, v2.2d +; CHECK-NEXT: stp q27, q6, [x8, #96] ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d -; CHECK-NEXT: stp q19, q26, [x8, #160] +; CHECK-NEXT: stp q26, q7, [x8, #32] ; CHECK-NEXT: stp q0, q2, [x8] -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d14, [sp], #64 // 8-byte Folded Reload ; CHECK-NEXT: ret %lp1 = load <8 x i16>, ptr %p %p2 = getelementptr i8, ptr %p, i32 16 Index: llvm/test/CodeGen/AArch64/insert-extend.ll =================================================================== --- llvm/test/CodeGen/AArch64/insert-extend.ll +++ llvm/test/CodeGen/AArch64/insert-extend.ll @@ -48,120 +48,114 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sxtw x8, w3 -; CHECK-NEXT: sxtw x9, w1 -; CHECK-NEXT: add x10, x2, x8 -; CHECK-NEXT: add x11, x0, x9 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: sxtw x9, w3 +; CHECK-NEXT: add x10, x0, x8 +; CHECK-NEXT: add x11, x2, x9 +; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: add x12, x10, x8 -; CHECK-NEXT: add x13, x11, x9 -; CHECK-NEXT: add x8, x12, x8 -; CHECK-NEXT: add x9, x13, x9 -; CHECK-NEXT: ldp s0, s6, [x11] -; CHECK-NEXT: ldp s3, s7, [x10] -; CHECK-NEXT: ldp s1, s5, [x8] -; CHECK-NEXT: ldp s2, s4, [x9] -; CHECK-NEXT: ld1 { v1.s }[1], 
[x12], #4 -; CHECK-NEXT: ld1 { v2.s }[1], [x13], #4 -; CHECK-NEXT: ld1 { v3.s }[1], [x2], #4 -; CHECK-NEXT: ld1 { v0.s }[1], [x0], #4 -; CHECK-NEXT: ld1 { v5.s }[1], [x12] -; CHECK-NEXT: ld1 { v4.s }[1], [x13] -; CHECK-NEXT: ld1 { v7.s }[1], [x2] -; CHECK-NEXT: ld1 { v6.s }[1], [x0] -; CHECK-NEXT: usubl v0.8h, v0.8b, v3.8b -; CHECK-NEXT: usubl v1.8h, v2.8b, v1.8b -; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b +; CHECK-NEXT: ldr d1, [x2] +; CHECK-NEXT: ldr d2, [x10] +; CHECK-NEXT: add x10, x11, x9 +; CHECK-NEXT: ldr d3, [x11] +; CHECK-NEXT: ldr d4, [x12] +; CHECK-NEXT: ldr d5, [x10] +; CHECK-NEXT: ldr d6, [x12, x8] +; CHECK-NEXT: ldr d7, [x10, x9] +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: usubl v1.8h, v4.8b, v5.8b +; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 ; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b -; CHECK-NEXT: shll v4.4s, v2.4h, #16 -; CHECK-NEXT: shll v5.4s, v3.4h, #16 -; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 -; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-NEXT: saddw2 v3.4s, v3.4s, v0.8h -; CHECK-NEXT: saddw v0.4s, v5.4s, v0.4h -; CHECK-NEXT: saddw2 v2.4s, v2.4s, v1.8h -; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h -; CHECK-NEXT: rev64 v6.4s, v0.4s -; CHECK-NEXT: rev64 v17.4s, v3.4s +; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h +; CHECK-NEXT: shll2 v7.4s, v1.8h, #16 +; CHECK-NEXT: shll2 v6.4s, v3.8h, #16 +; CHECK-NEXT: saddw v1.4s, v7.4s, v1.4h +; CHECK-NEXT: rev64 v4.4s, v0.4s ; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: addp v7.4s, v1.4s, v2.4s -; CHECK-NEXT: rev64 v4.4s, v1.4s -; CHECK-NEXT: addp v16.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v17.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s -; CHECK-NEXT: ext v18.16b, v7.16b, v7.16b, #8 -; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s -; CHECK-NEXT: zip1 v5.4s, v0.4s, v3.4s -; CHECK-NEXT: uzp2 v19.4s, v7.4s, v16.4s -; CHECK-NEXT: uzp1 v7.4s, v7.4s, v16.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s -; 
CHECK-NEXT: uzp1 v6.4s, v18.4s, v16.4s -; CHECK-NEXT: zip2 v4.4s, v2.4s, v1.4s -; CHECK-NEXT: uzp2 v16.4s, v18.4s, v16.4s -; CHECK-NEXT: mov v2.s[1], v1.s[0] -; CHECK-NEXT: ext v1.16b, v0.16b, v5.16b, #8 -; CHECK-NEXT: mov v0.s[3], v3.s[2] -; CHECK-NEXT: add v7.4s, v19.4s, v7.4s -; CHECK-NEXT: sub v3.4s, v6.4s, v16.4s -; CHECK-NEXT: rev64 v5.4s, v7.4s -; CHECK-NEXT: mov v2.d[1], v1.d[1] -; CHECK-NEXT: mov v4.d[1], v0.d[1] +; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h +; CHECK-NEXT: rev64 v7.4s, v1.4s +; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s ; CHECK-NEXT: rev64 v6.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v7.4s, v5.4s -; CHECK-NEXT: add v5.4s, v4.4s, v2.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v6.4s -; CHECK-NEXT: rev64 v4.4s, v5.4s -; CHECK-NEXT: addp v6.4s, v7.4s, v5.4s -; CHECK-NEXT: rev64 v7.4s, v2.4s -; CHECK-NEXT: addp v3.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v4.4s, v5.4s, v4.4s -; CHECK-NEXT: zip1 v16.4s, v6.4s, v6.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s -; CHECK-NEXT: ext v17.16b, v1.16b, v3.16b, #8 -; CHECK-NEXT: ext v5.16b, v6.16b, v4.16b, #4 -; CHECK-NEXT: ext v7.16b, v3.16b, v2.16b, #4 -; CHECK-NEXT: ext v18.16b, v0.16b, v6.16b, #4 -; CHECK-NEXT: trn2 v0.4s, v16.4s, v0.4s -; CHECK-NEXT: ext v16.16b, v17.16b, v1.16b, #4 -; CHECK-NEXT: zip2 v7.4s, v7.4s, v3.4s -; CHECK-NEXT: zip2 v5.4s, v5.4s, v6.4s -; CHECK-NEXT: ext v18.16b, v18.16b, v18.16b, #4 -; CHECK-NEXT: mov v1.s[2], v3.s[1] -; CHECK-NEXT: uzp2 v16.4s, v17.4s, v16.4s -; CHECK-NEXT: ext v7.16b, v2.16b, v7.16b, #12 -; CHECK-NEXT: ext v5.16b, v4.16b, v5.16b, #12 -; CHECK-NEXT: mov v2.s[2], v3.s[3] -; CHECK-NEXT: mov v4.s[2], v6.s[3] -; CHECK-NEXT: sub v17.4s, v0.4s, v18.4s -; CHECK-NEXT: mov v18.s[0], v6.s[1] -; CHECK-NEXT: sub v19.4s, v1.4s, v16.4s -; CHECK-NEXT: sub v20.4s, v2.4s, v7.4s -; CHECK-NEXT: sub v21.4s, v4.4s, v5.4s -; CHECK-NEXT: mov v1.s[1], v3.s[0] -; CHECK-NEXT: mov v2.s[1], v3.s[2] -; CHECK-NEXT: mov v4.s[1], v6.s[2] -; CHECK-NEXT: add v0.4s, v0.4s, v18.4s -; 
CHECK-NEXT: add v1.4s, v1.4s, v16.4s -; CHECK-NEXT: add v2.4s, v2.4s, v7.4s -; CHECK-NEXT: add v3.4s, v4.4s, v5.4s -; CHECK-NEXT: mov v2.d[1], v20.d[1] -; CHECK-NEXT: mov v3.d[1], v21.d[1] -; CHECK-NEXT: mov v0.d[1], v17.d[1] -; CHECK-NEXT: mov v1.d[1], v19.d[1] -; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 -; CHECK-NEXT: cmlt v5.8h, v3.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v0.8h, #0 -; CHECK-NEXT: cmlt v7.8h, v1.8h, #0 -; CHECK-NEXT: add v0.4s, v6.4s, v0.4s +; CHECK-NEXT: sub v5.4s, v2.4s, v5.4s +; CHECK-NEXT: sub v7.4s, v1.4s, v7.4s +; CHECK-NEXT: zip1 v16.4s, v5.4s, v4.4s +; CHECK-NEXT: addp v1.4s, v3.4s, v1.4s +; CHECK-NEXT: sub v6.4s, v3.4s, v6.4s +; CHECK-NEXT: addp v0.4s, v2.4s, v0.4s +; CHECK-NEXT: zip2 v17.4s, v7.4s, v6.4s +; CHECK-NEXT: mov v7.s[1], v6.s[0] +; CHECK-NEXT: ext v2.16b, v5.16b, v16.16b, #8 +; CHECK-NEXT: mov v5.s[3], v4.s[2] +; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: uzp2 v4.4s, v1.4s, v0.4s +; CHECK-NEXT: mov v7.d[1], v2.d[1] +; CHECK-NEXT: mov v17.d[1], v5.d[1] +; CHECK-NEXT: uzp1 v1.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp1 v2.4s, v3.4s, v0.4s +; CHECK-NEXT: uzp2 v0.4s, v3.4s, v0.4s +; CHECK-NEXT: add v3.4s, v17.4s, v7.4s +; CHECK-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-NEXT: sub v4.4s, v7.4s, v17.4s +; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s +; CHECK-NEXT: rev64 v2.4s, v3.4s +; CHECK-NEXT: rev64 v5.4s, v4.4s +; CHECK-NEXT: rev64 v7.4s, v0.4s +; CHECK-NEXT: rev64 v6.4s, v1.4s +; CHECK-NEXT: addp v16.4s, v0.4s, v4.4s +; CHECK-NEXT: addp v17.4s, v1.4s, v3.4s +; CHECK-NEXT: sub v4.4s, v4.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v3.4s, v2.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s +; CHECK-NEXT: ext v3.16b, v16.16b, v4.16b, #4 +; CHECK-NEXT: ext v5.16b, v0.16b, v16.16b, #8 +; CHECK-NEXT: ext v6.16b, v17.16b, v2.16b, #4 +; CHECK-NEXT: zip1 v7.4s, v17.4s, v17.4s +; CHECK-NEXT: zip2 v3.4s, v3.4s, v16.4s +; CHECK-NEXT: ext v18.16b, v5.16b, v0.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v6.4s, v17.4s +; CHECK-NEXT: trn2 v7.4s, 
v7.4s, v1.4s +; CHECK-NEXT: ext v1.16b, v1.16b, v17.16b, #4 +; CHECK-NEXT: ext v3.16b, v4.16b, v3.16b, #12 +; CHECK-NEXT: mov v0.s[2], v16.s[1] +; CHECK-NEXT: ext v6.16b, v2.16b, v6.16b, #12 +; CHECK-NEXT: mov v4.s[2], v16.s[3] +; CHECK-NEXT: uzp2 v5.4s, v5.4s, v18.4s +; CHECK-NEXT: mov v2.s[2], v17.s[3] +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: sub v18.4s, v4.4s, v3.4s +; CHECK-NEXT: sub v19.4s, v0.4s, v5.4s +; CHECK-NEXT: sub v20.4s, v2.4s, v6.4s +; CHECK-NEXT: mov v4.s[1], v16.s[2] +; CHECK-NEXT: sub v21.4s, v7.4s, v1.4s +; CHECK-NEXT: mov v2.s[1], v17.s[2] +; CHECK-NEXT: mov v0.s[1], v16.s[0] +; CHECK-NEXT: mov v1.s[0], v17.s[1] +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: add v2.4s, v2.4s, v6.4s +; CHECK-NEXT: add v0.4s, v0.4s, v5.4s ; CHECK-NEXT: add v1.4s, v7.4s, v1.4s -; CHECK-NEXT: add v2.4s, v4.4s, v2.4s -; CHECK-NEXT: add v3.4s, v5.4s, v3.4s -; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v5.16b -; CHECK-NEXT: eor v1.16b, v1.16b, v7.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: add v1.4s, v3.4s, v2.4s +; CHECK-NEXT: mov v3.d[1], v18.d[1] +; CHECK-NEXT: mov v2.d[1], v20.d[1] +; CHECK-NEXT: mov v1.d[1], v21.d[1] +; CHECK-NEXT: mov v0.d[1], v19.d[1] +; CHECK-NEXT: cmlt v4.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v5.8h, v2.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v1.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v0.8h, #0 +; CHECK-NEXT: add v1.4s, v6.4s, v1.4s +; CHECK-NEXT: add v0.4s, v7.4s, v0.4s +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: eor v3.16b, v3.16b, v4.16b +; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v7.16b +; CHECK-NEXT: eor v1.16b, v1.16b, v6.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v2.4s, v3.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 Index: llvm/test/CodeGen/AArch64/reduce-shuffle.ll 
=================================================================== --- llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -4,123 +4,117 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) { ; CHECK-LABEL: v1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: sxtw x10, w3 -; CHECK-NEXT: add x9, x0, x8 -; CHECK-NEXT: add x12, x2, x10 -; CHECK-NEXT: add x11, x9, x8 -; CHECK-NEXT: add x13, x12, x10 -; CHECK-NEXT: add x8, x11, x8 -; CHECK-NEXT: add x10, x13, x10 -; CHECK-NEXT: ldp s1, s0, [x9] -; CHECK-NEXT: ldp s7, s6, [x12] -; CHECK-NEXT: ldp s3, s2, [x8] -; CHECK-NEXT: ldp s5, s4, [x10] -; CHECK-NEXT: ld1 { v5.s }[1], [x13], #4 -; CHECK-NEXT: ld1 { v3.s }[1], [x11], #4 -; CHECK-NEXT: ld1 { v7.s }[1], [x2], #4 -; CHECK-NEXT: ld1 { v1.s }[1], [x0], #4 -; CHECK-NEXT: ld1 { v4.s }[1], [x13] -; CHECK-NEXT: ld1 { v2.s }[1], [x11] -; CHECK-NEXT: ld1 { v6.s }[1], [x2] -; CHECK-NEXT: ld1 { v0.s }[1], [x0] -; CHECK-NEXT: usubl v3.8h, v3.8b, v5.8b -; CHECK-NEXT: usubl v2.8h, v2.8b, v4.8b -; CHECK-NEXT: usubl v1.8h, v1.8b, v7.8b -; CHECK-NEXT: usubl v0.8h, v0.8b, v6.8b -; CHECK-NEXT: shll v4.4s, v2.4h, #16 -; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-NEXT: shll v5.4s, v0.4h, #16 -; CHECK-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-NEXT: saddw2 v2.4s, v2.4s, v3.8h -; CHECK-NEXT: saddw v3.4s, v4.4s, v3.4h -; CHECK-NEXT: saddw2 v0.4s, v0.4s, v1.8h -; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h -; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s +; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 +; CHECK-NEXT: sxtw x9, w3 +; CHECK-NEXT: add x10, x0, x8 +; CHECK-NEXT: add x11, x2, x9 +; CHECK-NEXT: add x12, x10, x8 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x2] +; CHECK-NEXT: ldr d2, [x10] +; CHECK-NEXT: add x10, x11, x9 +; CHECK-NEXT: ldr d6, [x12, x8] +; 
CHECK-NEXT: ldr d7, [x10, x9] +; CHECK-NEXT: ldr d3, [x11] +; CHECK-NEXT: ldr d4, [x12] +; CHECK-NEXT: ldr d5, [x10] +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: usubl v1.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b +; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 +; CHECK-NEXT: shll2 v6.4s, v3.8h, #16 +; CHECK-NEXT: shll2 v7.4s, v1.8h, #16 +; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h +; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h +; CHECK-NEXT: saddw v1.4s, v7.4s, v1.4h +; CHECK-NEXT: zip1 v5.4s, v2.4s, v0.4s +; CHECK-NEXT: zip2 v4.4s, v2.4s, v0.4s +; CHECK-NEXT: uzp2 v7.4s, v3.4s, v1.4s +; CHECK-NEXT: mov v17.16b, v1.16b +; CHECK-NEXT: zip2 v18.4s, v3.4s, v1.4s +; CHECK-NEXT: ext v19.16b, v2.16b, v5.16b, #8 +; CHECK-NEXT: uzp2 v7.4s, v7.4s, v3.4s +; CHECK-NEXT: mov v2.s[3], v0.s[2] +; CHECK-NEXT: zip2 v6.4s, v1.4s, v3.4s ; CHECK-NEXT: ext v16.16b, v3.16b, v3.16b, #12 -; CHECK-NEXT: zip1 v17.4s, v1.4s, v0.4s -; CHECK-NEXT: mov v7.16b, v3.16b -; CHECK-NEXT: zip2 v4.4s, v2.4s, v3.4s -; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s -; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s -; CHECK-NEXT: mov v7.s[0], v2.s[1] -; CHECK-NEXT: ext v16.16b, v2.16b, v16.16b, #12 -; CHECK-NEXT: ext v19.16b, v1.16b, v17.16b, #8 -; CHECK-NEXT: uzp2 v5.4s, v5.4s, v3.4s -; CHECK-NEXT: mov v2.s[1], v3.s[0] -; CHECK-NEXT: mov v1.s[3], v0.s[2] -; CHECK-NEXT: mov v7.d[1], v17.d[1] -; CHECK-NEXT: mov v5.d[1], v6.d[1] -; CHECK-NEXT: mov v2.d[1], v19.d[1] -; CHECK-NEXT: mov v18.d[1], v1.d[1] -; CHECK-NEXT: mov v16.d[1], v6.d[1] -; CHECK-NEXT: mov v4.d[1], v1.d[1] -; CHECK-NEXT: add v0.4s, v7.4s, v2.4s -; CHECK-NEXT: add v1.4s, v5.4s, v18.4s +; CHECK-NEXT: mov v17.s[1], v3.s[0] +; CHECK-NEXT: mov v3.s[0], v1.s[1] +; CHECK-NEXT: mov v7.d[1], v4.d[1] +; CHECK-NEXT: mov v18.d[1], v2.d[1] +; CHECK-NEXT: mov v17.d[1], v19.d[1] +; CHECK-NEXT: mov v3.d[1], v5.d[1] +; CHECK-NEXT: ext v16.16b, 
v1.16b, v16.16b, #12 +; CHECK-NEXT: add v1.4s, v7.4s, v18.4s +; CHECK-NEXT: mov v6.d[1], v2.d[1] +; CHECK-NEXT: add v0.4s, v3.4s, v17.4s +; CHECK-NEXT: mov v16.d[1], v4.d[1] +; CHECK-NEXT: sub v2.4s, v17.4s, v3.4s +; CHECK-NEXT: rev64 v3.4s, v1.4s ; CHECK-NEXT: rev64 v5.4s, v0.4s -; CHECK-NEXT: sub v3.4s, v4.4s, v16.4s -; CHECK-NEXT: rev64 v4.4s, v1.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s +; CHECK-NEXT: sub v4.4s, v6.4s, v16.4s +; CHECK-NEXT: mov v3.d[1], v1.d[1] ; CHECK-NEXT: mov v5.d[1], v0.d[1] -; CHECK-NEXT: add v6.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v3.4s -; CHECK-NEXT: mov v4.d[1], v1.d[1] -; CHECK-NEXT: rev64 v7.4s, v2.4s +; CHECK-NEXT: add v6.4s, v4.4s, v2.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v3.4s +; CHECK-NEXT: rev64 v4.4s, v2.4s ; CHECK-NEXT: rev64 v3.4s, v6.4s ; CHECK-NEXT: add v1.4s, v1.4s, v5.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v7.4s, v2.4s, v7.4s +; CHECK-NEXT: addp v7.4s, v0.4s, v2.4s ; CHECK-NEXT: addp v5.4s, v1.4s, v6.4s -; CHECK-NEXT: addp v2.4s, v0.4s, v2.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s ; CHECK-NEXT: sub v3.4s, v6.4s, v3.4s -; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v6.4s, v1.4s -; CHECK-NEXT: zip1 v16.4s, v5.4s, v5.4s -; CHECK-NEXT: ext v17.16b, v2.16b, v7.16b, #4 -; CHECK-NEXT: ext v18.16b, v5.16b, v3.16b, #4 -; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s -; CHECK-NEXT: ext v4.16b, v0.16b, v2.16b, #8 -; CHECK-NEXT: ext v6.16b, v1.16b, v5.16b, #4 -; CHECK-NEXT: trn2 v1.4s, v16.4s, v1.4s -; CHECK-NEXT: zip2 v16.4s, v17.4s, v2.4s -; CHECK-NEXT: zip2 v17.4s, v18.4s, v5.4s -; CHECK-NEXT: ext v18.16b, v4.16b, v0.16b, #4 -; CHECK-NEXT: ext v6.16b, v6.16b, v6.16b, #4 -; CHECK-NEXT: ext v16.16b, v7.16b, v16.16b, #12 -; CHECK-NEXT: ext v17.16b, v3.16b, v17.16b, #12 +; CHECK-NEXT: rev64 v6.4s, v0.4s +; CHECK-NEXT: ext v4.16b, v7.16b, v2.16b, #4 +; CHECK-NEXT: rev64 v16.4s, v1.4s +; CHECK-NEXT: ext v17.16b, v5.16b, v3.16b, #4 
+; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s +; CHECK-NEXT: zip2 v4.4s, v4.4s, v7.4s +; CHECK-NEXT: ext v6.16b, v0.16b, v7.16b, #8 +; CHECK-NEXT: sub v1.4s, v1.4s, v16.4s +; CHECK-NEXT: zip2 v16.4s, v17.4s, v5.4s +; CHECK-NEXT: zip1 v18.4s, v5.4s, v5.4s +; CHECK-NEXT: ext v19.16b, v1.16b, v5.16b, #4 +; CHECK-NEXT: ext v4.16b, v2.16b, v4.16b, #12 +; CHECK-NEXT: mov v2.s[2], v7.s[3] +; CHECK-NEXT: ext v17.16b, v6.16b, v0.16b, #4 +; CHECK-NEXT: ext v16.16b, v3.16b, v16.16b, #12 ; CHECK-NEXT: mov v3.s[2], v5.s[3] -; CHECK-NEXT: mov v7.s[2], v2.s[3] -; CHECK-NEXT: mov v0.s[2], v2.s[1] -; CHECK-NEXT: uzp2 v4.4s, v4.4s, v18.4s -; CHECK-NEXT: sub v20.4s, v3.4s, v17.4s -; CHECK-NEXT: sub v21.4s, v7.4s, v16.4s +; CHECK-NEXT: trn2 v1.4s, v18.4s, v1.4s +; CHECK-NEXT: ext v18.16b, v19.16b, v19.16b, #4 +; CHECK-NEXT: mov v0.s[2], v7.s[1] +; CHECK-NEXT: uzp2 v6.4s, v6.4s, v17.4s +; CHECK-NEXT: sub v17.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v21.4s, v3.4s, v16.4s ; CHECK-NEXT: mov v3.s[1], v5.s[2] -; CHECK-NEXT: mov v7.s[1], v2.s[2] -; CHECK-NEXT: sub v18.4s, v1.4s, v6.4s -; CHECK-NEXT: mov v6.s[0], v5.s[1] -; CHECK-NEXT: sub v19.4s, v0.4s, v4.4s -; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: add v2.4s, v3.4s, v17.4s -; CHECK-NEXT: add v3.4s, v7.4s, v16.4s -; CHECK-NEXT: add v1.4s, v1.4s, v6.4s +; CHECK-NEXT: mov v2.s[1], v7.s[2] +; CHECK-NEXT: sub v19.4s, v1.4s, v18.4s +; CHECK-NEXT: mov v18.s[0], v5.s[1] +; CHECK-NEXT: sub v20.4s, v0.4s, v6.4s +; CHECK-NEXT: mov v0.s[1], v7.s[0] +; CHECK-NEXT: add v3.4s, v3.4s, v16.4s +; CHECK-NEXT: add v2.4s, v2.4s, v4.4s +; CHECK-NEXT: add v1.4s, v1.4s, v18.4s +; CHECK-NEXT: mov v2.d[1], v17.d[1] ; CHECK-NEXT: mov v3.d[1], v21.d[1] -; CHECK-NEXT: mov v2.d[1], v20.d[1] -; CHECK-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NEXT: mov v1.d[1], v18.d[1] -; CHECK-NEXT: mov v0.d[1], v19.d[1] -; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 -; CHECK-NEXT: cmlt v7.8h, v2.8h, #0 +; CHECK-NEXT: add v0.4s, v0.4s, v6.4s +; CHECK-NEXT: mov v1.d[1], v19.d[1] +; CHECK-NEXT: mov 
v0.d[1], v20.d[1] +; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v3.8h, #0 ; CHECK-NEXT: cmlt v4.8h, v1.8h, #0 -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v2.4s, v7.4s, v2.4s +; CHECK-NEXT: add v2.4s, v6.4s, v2.4s +; CHECK-NEXT: add v3.4s, v7.4s, v3.4s ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 ; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s +; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b +; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b +; CHECK-NEXT: add v2.4s, v3.4s, v2.4s ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: eor v1.16b, v1.16b, v4.16b ; CHECK-NEXT: add v1.4s, v1.4s, v2.4s @@ -226,121 +220,115 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) { ; CHECK-LABEL: v2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: sxtw x10, w3 -; CHECK-NEXT: add x9, x0, x8 -; CHECK-NEXT: add x12, x2, x10 -; CHECK-NEXT: add x11, x9, x8 -; CHECK-NEXT: add x13, x12, x10 -; CHECK-NEXT: add x8, x11, x8 -; CHECK-NEXT: add x10, x13, x10 -; CHECK-NEXT: ldp s1, s0, [x9] -; CHECK-NEXT: ldp s7, s6, [x12] -; CHECK-NEXT: ldp s3, s2, [x8] -; CHECK-NEXT: ldp s5, s4, [x10] -; CHECK-NEXT: ld1 { v5.s }[1], [x13], #4 -; CHECK-NEXT: ld1 { v3.s }[1], [x11], #4 -; CHECK-NEXT: ld1 { v7.s }[1], [x2], #4 -; CHECK-NEXT: ld1 { v1.s }[1], [x0], #4 -; CHECK-NEXT: ld1 { v4.s }[1], [x13] -; CHECK-NEXT: ld1 { v2.s }[1], [x11] -; CHECK-NEXT: ld1 { v6.s }[1], [x2] -; CHECK-NEXT: ld1 { v0.s }[1], [x0] -; CHECK-NEXT: usubl v3.8h, v3.8b, v5.8b -; CHECK-NEXT: usubl v2.8h, v2.8b, v4.8b -; CHECK-NEXT: usubl v1.8h, v1.8b, v7.8b -; CHECK-NEXT: usubl v0.8h, v0.8b, v6.8b -; CHECK-NEXT: shll v4.4s, v2.4h, #16 -; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-NEXT: shll v5.4s, v0.4h, #16 -; 
CHECK-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-NEXT: saddw2 v2.4s, v2.4s, v3.8h -; CHECK-NEXT: saddw v3.4s, v4.4s, v3.4h -; CHECK-NEXT: saddw2 v0.4s, v0.4s, v1.8h -; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h -; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s +; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 +; CHECK-NEXT: sxtw x9, w3 +; CHECK-NEXT: add x10, x0, x8 +; CHECK-NEXT: add x11, x2, x9 +; CHECK-NEXT: add x12, x10, x8 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x2] +; CHECK-NEXT: ldr d2, [x10] +; CHECK-NEXT: add x10, x11, x9 +; CHECK-NEXT: ldr d6, [x12, x8] +; CHECK-NEXT: ldr d7, [x10, x9] +; CHECK-NEXT: ldr d3, [x11] +; CHECK-NEXT: ldr d4, [x12] +; CHECK-NEXT: ldr d5, [x10] +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: usubl v1.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b +; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 +; CHECK-NEXT: shll2 v6.4s, v3.8h, #16 +; CHECK-NEXT: shll2 v7.4s, v1.8h, #16 +; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h +; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h +; CHECK-NEXT: saddw v1.4s, v7.4s, v1.4h +; CHECK-NEXT: zip1 v5.4s, v2.4s, v0.4s ; CHECK-NEXT: ext v17.16b, v3.16b, v3.16b, #12 -; CHECK-NEXT: zip1 v7.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp2 v7.4s, v3.4s, v1.4s ; CHECK-NEXT: mov v16.16b, v3.16b -; CHECK-NEXT: zip2 v4.4s, v2.4s, v3.4s -; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s -; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s -; CHECK-NEXT: mov v16.s[0], v2.s[1] -; CHECK-NEXT: ext v19.16b, v1.16b, v7.16b, #8 -; CHECK-NEXT: ext v17.16b, v2.16b, v17.16b, #12 -; CHECK-NEXT: uzp2 v5.4s, v5.4s, v3.4s -; CHECK-NEXT: mov v1.s[3], v0.s[2] -; CHECK-NEXT: mov v2.s[1], v3.s[0] -; CHECK-NEXT: mov v16.d[1], v7.d[1] -; CHECK-NEXT: mov v5.d[1], v6.d[1] -; CHECK-NEXT: mov v18.d[1], v1.d[1] -; CHECK-NEXT: mov v2.d[1], v19.d[1] -; CHECK-NEXT: mov v4.d[1], v1.d[1] -; CHECK-NEXT: mov v17.d[1], v6.d[1] -; CHECK-NEXT: add v0.4s, v5.4s, v18.4s -; 
CHECK-NEXT: add v1.4s, v16.4s, v2.4s +; CHECK-NEXT: zip2 v4.4s, v2.4s, v0.4s +; CHECK-NEXT: zip2 v6.4s, v1.4s, v3.4s +; CHECK-NEXT: zip2 v18.4s, v3.4s, v1.4s +; CHECK-NEXT: ext v19.16b, v2.16b, v5.16b, #8 +; CHECK-NEXT: mov v16.s[0], v1.s[1] +; CHECK-NEXT: ext v17.16b, v1.16b, v17.16b, #12 +; CHECK-NEXT: uzp2 v7.4s, v7.4s, v3.4s +; CHECK-NEXT: mov v2.s[3], v0.s[2] +; CHECK-NEXT: mov v1.s[1], v3.s[0] +; CHECK-NEXT: mov v16.d[1], v5.d[1] +; CHECK-NEXT: mov v7.d[1], v4.d[1] +; CHECK-NEXT: mov v18.d[1], v2.d[1] +; CHECK-NEXT: mov v1.d[1], v19.d[1] +; CHECK-NEXT: mov v6.d[1], v2.d[1] +; CHECK-NEXT: mov v17.d[1], v4.d[1] +; CHECK-NEXT: add v0.4s, v7.4s, v18.4s +; CHECK-NEXT: add v2.4s, v16.4s, v1.4s ; CHECK-NEXT: rev64 v3.4s, v0.4s -; CHECK-NEXT: rev64 v5.4s, v1.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v16.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s +; CHECK-NEXT: rev64 v4.4s, v2.4s +; CHECK-NEXT: sub v5.4s, v6.4s, v17.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v16.4s ; CHECK-NEXT: mov v3.d[1], v0.d[1] -; CHECK-NEXT: mov v5.d[1], v1.d[1] -; CHECK-NEXT: add v6.4s, v4.4s, v2.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s -; CHECK-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-NEXT: zip1 v3.4s, v1.4s, v2.4s -; CHECK-NEXT: zip1 v4.4s, v0.4s, v6.4s +; CHECK-NEXT: mov v4.d[1], v2.d[1] +; CHECK-NEXT: add v6.4s, v5.4s, v1.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v3.4s +; CHECK-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NEXT: zip1 v3.4s, v2.4s, v1.4s ; CHECK-NEXT: uzp2 v5.4s, v0.4s, v6.4s -; CHECK-NEXT: mov v17.16b, v1.16b -; CHECK-NEXT: zip2 v7.4s, v0.4s, v6.4s -; CHECK-NEXT: ext v16.16b, v1.16b, v3.16b, #8 -; CHECK-NEXT: trn2 v4.4s, v0.4s, v4.4s +; CHECK-NEXT: zip2 v4.4s, v2.4s, v1.4s +; CHECK-NEXT: zip1 v7.4s, v0.4s, v6.4s +; CHECK-NEXT: ext v16.16b, v2.16b, v3.16b, #8 +; CHECK-NEXT: zip2 v17.4s, v0.4s, v6.4s ; CHECK-NEXT: uzp2 v5.4s, v5.4s, v0.4s -; CHECK-NEXT: zip2 v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mov v17.s[3], v2.s[2] -; 
CHECK-NEXT: mov v0.s[1], v6.s[1] -; CHECK-NEXT: mov v4.d[1], v16.d[1] -; CHECK-NEXT: mov v5.d[1], v1.d[1] -; CHECK-NEXT: mov v7.d[1], v17.d[1] -; CHECK-NEXT: mov v0.d[1], v3.d[1] -; CHECK-NEXT: add v1.4s, v7.4s, v5.4s -; CHECK-NEXT: add v2.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s -; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #4 -; CHECK-NEXT: ext v16.16b, v1.16b, v1.16b, #4 -; CHECK-NEXT: sub v3.4s, v5.4s, v7.4s -; CHECK-NEXT: zip2 v5.4s, v0.4s, v2.4s -; CHECK-NEXT: zip1 v6.4s, v1.4s, v3.4s -; CHECK-NEXT: zip2 v7.4s, v1.4s, v3.4s -; CHECK-NEXT: zip2 v1.4s, v3.4s, v1.4s -; CHECK-NEXT: zip1 v17.4s, v2.4s, v0.4s -; CHECK-NEXT: zip2 v2.4s, v2.4s, v0.4s -; CHECK-NEXT: ext v0.16b, v4.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v16.16b, v3.16b, #8 -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-NEXT: sub v2.4s, v7.4s, v2.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v4.16b, #4 -; CHECK-NEXT: ext v3.16b, v3.16b, v16.16b, #4 -; CHECK-NEXT: sub v5.4s, v6.4s, v17.4s -; CHECK-NEXT: cmlt v7.8h, v2.8h, #0 -; CHECK-NEXT: cmlt v17.8h, v1.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v5.8h, #0 -; CHECK-NEXT: add v1.4s, v17.4s, v1.4s -; CHECK-NEXT: add v2.4s, v7.4s, v2.4s -; CHECK-NEXT: add v0.4s, v0.4s, v3.4s -; CHECK-NEXT: add v4.4s, v6.4s, v5.4s -; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b -; CHECK-NEXT: eor v1.16b, v1.16b, v17.16b -; CHECK-NEXT: cmlt v3.8h, v0.8h, #0 -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-NEXT: eor v2.16b, v4.16b, v6.16b +; CHECK-NEXT: mov v2.s[3], v1.s[2] +; CHECK-NEXT: mov v18.16b, v0.16b +; CHECK-NEXT: trn2 v0.4s, v0.4s, v7.4s +; CHECK-NEXT: mov v18.s[1], v6.s[1] +; CHECK-NEXT: mov v5.d[1], v4.d[1] +; CHECK-NEXT: mov v17.d[1], v2.d[1] +; CHECK-NEXT: mov v0.d[1], v16.d[1] +; CHECK-NEXT: mov v18.d[1], v3.d[1] +; CHECK-NEXT: add v1.4s, v17.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v5.4s, v17.4s +; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: add v3.4s, v18.4s, v0.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v18.4s +; CHECK-NEXT: 
ext v5.16b, v3.16b, v3.16b, #4 +; CHECK-NEXT: ext v16.16b, v4.16b, v2.16b, #8 +; CHECK-NEXT: zip1 v6.4s, v1.4s, v2.4s +; CHECK-NEXT: zip2 v7.4s, v1.4s, v2.4s +; CHECK-NEXT: ext v17.16b, v5.16b, v0.16b, #8 +; CHECK-NEXT: zip2 v1.4s, v2.4s, v1.4s +; CHECK-NEXT: zip2 v2.4s, v0.4s, v3.4s +; CHECK-NEXT: ext v4.16b, v16.16b, v4.16b, #4 +; CHECK-NEXT: zip1 v16.4s, v3.4s, v0.4s +; CHECK-NEXT: zip2 v0.4s, v3.4s, v0.4s +; CHECK-NEXT: ext v5.16b, v17.16b, v5.16b, #4 ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: sub v3.4s, v6.4s, v16.4s +; CHECK-NEXT: sub v0.4s, v7.4s, v0.4s +; CHECK-NEXT: cmlt v6.8h, v1.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v0.8h, #0 +; CHECK-NEXT: add v2.4s, v5.4s, v4.4s +; CHECK-NEXT: cmlt v4.8h, v3.8h, #0 +; CHECK-NEXT: add v1.4s, v6.4s, v1.4s +; CHECK-NEXT: add v0.4s, v7.4s, v0.4s +; CHECK-NEXT: cmlt v5.8h, v2.8h, #0 +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: eor v0.16b, v0.16b, v7.16b +; CHECK-NEXT: eor v1.16b, v1.16b, v6.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: eor v1.16b, v3.16b, v4.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: eor v1.16b, v2.16b, v5.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 @@ -446,117 +434,112 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) { ; CHECK-LABEL: v3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sxtw x8, w3 -; CHECK-NEXT: sxtw x9, w1 -; CHECK-NEXT: add x10, x2, x8 -; CHECK-NEXT: add x11, x0, x9 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 +; CHECK-NEXT: sxtw x9, w3 +; CHECK-NEXT: add x10, x0, x8 +; CHECK-NEXT: add x11, x2, x9 ; CHECK-NEXT: add x12, x10, x8 -; CHECK-NEXT: add 
x13, x11, x9 -; CHECK-NEXT: add x8, x12, x8 -; CHECK-NEXT: add x9, x13, x9 -; CHECK-NEXT: ldp s0, s6, [x11] -; CHECK-NEXT: ldp s3, s7, [x10] -; CHECK-NEXT: ldp s1, s5, [x8] -; CHECK-NEXT: ldp s2, s4, [x9] -; CHECK-NEXT: ld1 { v1.s }[1], [x12], #4 -; CHECK-NEXT: ld1 { v2.s }[1], [x13], #4 -; CHECK-NEXT: ld1 { v3.s }[1], [x2], #4 -; CHECK-NEXT: ld1 { v0.s }[1], [x0], #4 -; CHECK-NEXT: ld1 { v5.s }[1], [x12] -; CHECK-NEXT: ld1 { v4.s }[1], [x13] -; CHECK-NEXT: ld1 { v7.s }[1], [x2] -; CHECK-NEXT: ld1 { v6.s }[1], [x0] -; CHECK-NEXT: usubl v0.8h, v0.8b, v3.8b -; CHECK-NEXT: usubl v1.8h, v2.8b, v1.8b -; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x2] +; CHECK-NEXT: ldr d2, [x10] +; CHECK-NEXT: add x10, x11, x9 +; CHECK-NEXT: ldr d4, [x12, x8] +; CHECK-NEXT: ldr d5, [x10, x9] +; CHECK-NEXT: ldr d3, [x11] +; CHECK-NEXT: ldr d6, [x12] +; CHECK-NEXT: ldr d7, [x10] +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: usubl v1.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b ; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b -; CHECK-NEXT: shll v4.4s, v2.4h, #16 -; CHECK-NEXT: shll v5.4s, v3.4h, #16 -; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 -; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-NEXT: saddw2 v3.4s, v3.4s, v0.8h -; CHECK-NEXT: saddw v0.4s, v5.4s, v0.4h -; CHECK-NEXT: saddw2 v2.4s, v2.4s, v1.8h -; CHECK-NEXT: rev64 v17.4s, v3.4s -; CHECK-NEXT: rev64 v6.4s, v0.4s -; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h +; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 +; CHECK-NEXT: shll2 v6.4s, v3.8h, #16 +; CHECK-NEXT: shll2 v7.4s, v1.8h, #16 +; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h +; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h +; CHECK-NEXT: saddw v1.4s, v7.4s, v1.4h +; CHECK-NEXT: rev64 v4.4s, v0.4s ; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: addp v16.4s, v0.4s, v3.4s -; CHECK-NEXT: rev64 v4.4s, v1.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v17.4s -; CHECK-NEXT: sub v0.4s, 
v0.4s, v6.4s -; CHECK-NEXT: addp v7.4s, v1.4s, v2.4s -; CHECK-NEXT: ext v17.16b, v0.16b, v3.16b, #4 +; CHECK-NEXT: rev64 v7.4s, v1.4s +; CHECK-NEXT: rev64 v16.4s, v3.4s +; CHECK-NEXT: addp v6.4s, v2.4s, v0.4s +; CHECK-NEXT: addp v17.4s, v1.4s, v3.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s ; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s -; CHECK-NEXT: uzp2 v5.4s, v7.4s, v16.4s -; CHECK-NEXT: ext v4.16b, v16.16b, v16.16b, #8 -; CHECK-NEXT: uzp1 v16.4s, v7.4s, v16.4s -; CHECK-NEXT: zip2 v6.4s, v1.4s, v2.4s -; CHECK-NEXT: mov v3.s[3], v0.s[2] -; CHECK-NEXT: zip1 v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ext v0.16b, v17.16b, v0.16b, #4 -; CHECK-NEXT: rev64 v2.4s, v5.4s -; CHECK-NEXT: uzp1 v5.4s, v7.4s, v4.4s -; CHECK-NEXT: rev64 v16.4s, v16.4s -; CHECK-NEXT: uzp2 v4.4s, v7.4s, v4.4s -; CHECK-NEXT: mov v6.d[1], v3.d[1] +; CHECK-NEXT: sub v3.4s, v3.4s, v16.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v7.4s +; CHECK-NEXT: ext v4.16b, v2.16b, v0.16b, #4 +; CHECK-NEXT: zip2 v5.4s, v1.4s, v3.4s +; CHECK-NEXT: mov v0.s[3], v2.s[2] +; CHECK-NEXT: uzp2 v7.4s, v17.4s, v6.4s +; CHECK-NEXT: zip1 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ext v3.16b, v6.16b, v6.16b, #8 +; CHECK-NEXT: mov v5.d[1], v0.d[1] +; CHECK-NEXT: ext v0.16b, v4.16b, v2.16b, #4 +; CHECK-NEXT: uzp1 v2.4s, v17.4s, v6.4s +; CHECK-NEXT: rev64 v4.4s, v7.4s ; CHECK-NEXT: mov v1.d[1], v0.d[1] -; CHECK-NEXT: add v0.4s, v2.4s, v16.4s -; CHECK-NEXT: sub v2.4s, v5.4s, v4.4s -; CHECK-NEXT: sub v3.4s, v1.4s, v6.4s -; CHECK-NEXT: add v1.4s, v6.4s, v1.4s -; CHECK-NEXT: zip1 v4.4s, v2.4s, v3.4s -; CHECK-NEXT: zip1 v5.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v6.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v7.4s, v2.4s, v3.4s -; CHECK-NEXT: zip2 v16.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v17.16b, v2.16b, v4.16b, #8 -; CHECK-NEXT: uzp2 v6.4s, v6.4s, v0.4s -; CHECK-NEXT: mov v2.s[3], v3.s[2] -; CHECK-NEXT: trn2 v3.4s, v0.4s, v5.4s -; CHECK-NEXT: mov v0.s[1], v1.s[1] -; CHECK-NEXT: mov v6.d[1], v7.d[1] -; CHECK-NEXT: mov v16.d[1], v2.d[1] -; 
CHECK-NEXT: mov v3.d[1], v17.d[1] -; CHECK-NEXT: mov v0.d[1], v4.d[1] -; CHECK-NEXT: add v1.4s, v6.4s, v16.4s -; CHECK-NEXT: sub v2.4s, v16.4s, v6.4s -; CHECK-NEXT: add v7.4s, v3.4s, v0.4s -; CHECK-NEXT: ext v6.16b, v1.16b, v1.16b, #4 -; CHECK-NEXT: sub v0.4s, v0.4s, v3.4s -; CHECK-NEXT: ext v3.16b, v7.16b, v7.16b, #4 -; CHECK-NEXT: zip1 v4.4s, v1.4s, v2.4s -; CHECK-NEXT: zip2 v5.4s, v1.4s, v2.4s -; CHECK-NEXT: zip2 v1.4s, v2.4s, v1.4s -; CHECK-NEXT: zip2 v16.4s, v0.4s, v7.4s -; CHECK-NEXT: zip1 v17.4s, v7.4s, v0.4s -; CHECK-NEXT: zip2 v7.4s, v7.4s, v0.4s -; CHECK-NEXT: ext v2.16b, v6.16b, v2.16b, #8 -; CHECK-NEXT: ext v0.16b, v3.16b, v0.16b, #8 -; CHECK-NEXT: add v1.4s, v16.4s, v1.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s -; CHECK-NEXT: ext v2.16b, v2.16b, v6.16b, #4 -; CHECK-NEXT: ext v0.16b, v0.16b, v3.16b, #4 -; CHECK-NEXT: sub v3.4s, v5.4s, v7.4s -; CHECK-NEXT: cmlt v5.8h, v4.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: cmlt v2.8h, v1.8h, #0 -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: cmlt v7.8h, v0.8h, #0 -; CHECK-NEXT: add v4.4s, v5.4s, v4.4s -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: add v0.4s, v7.4s, v0.4s -; CHECK-NEXT: eor v2.16b, v4.16b, v5.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: eor v0.16b, v0.16b, v7.16b +; CHECK-NEXT: rev64 v0.4s, v2.4s +; CHECK-NEXT: uzp1 v2.4s, v17.4s, v3.4s +; CHECK-NEXT: uzp2 v3.4s, v17.4s, v3.4s +; CHECK-NEXT: add v6.4s, v5.4s, v1.4s +; CHECK-NEXT: add v0.4s, v4.4s, v0.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v3.4s +; CHECK-NEXT: zip1 v3.4s, v0.4s, v6.4s +; CHECK-NEXT: zip1 v4.4s, v2.4s, v1.4s +; CHECK-NEXT: mov v7.16b, v0.16b +; CHECK-NEXT: uzp2 v5.4s, v0.4s, v6.4s +; CHECK-NEXT: trn2 v3.4s, v0.4s, v3.4s +; CHECK-NEXT: ext v16.16b, v2.16b, v4.16b, #8 +; CHECK-NEXT: mov v7.s[1], 
v6.s[1] +; CHECK-NEXT: uzp2 v5.4s, v5.4s, v0.4s +; CHECK-NEXT: zip2 v0.4s, v0.4s, v6.4s +; CHECK-NEXT: mov v3.d[1], v16.d[1] +; CHECK-NEXT: zip2 v6.4s, v2.4s, v1.4s +; CHECK-NEXT: mov v7.d[1], v4.d[1] +; CHECK-NEXT: mov v2.s[3], v1.s[2] +; CHECK-NEXT: mov v5.d[1], v6.d[1] +; CHECK-NEXT: add v1.4s, v3.4s, v7.4s +; CHECK-NEXT: mov v0.d[1], v2.d[1] +; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: sub v3.4s, v7.4s, v3.4s +; CHECK-NEXT: add v4.4s, v5.4s, v0.4s +; CHECK-NEXT: ext v6.16b, v2.16b, v3.16b, #8 +; CHECK-NEXT: ext v7.16b, v4.16b, v4.16b, #4 +; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s +; CHECK-NEXT: zip2 v5.4s, v3.4s, v1.4s +; CHECK-NEXT: ext v2.16b, v6.16b, v2.16b, #4 +; CHECK-NEXT: ext v6.16b, v7.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v16.4s, v4.4s, v0.4s +; CHECK-NEXT: zip2 v17.4s, v4.4s, v0.4s +; CHECK-NEXT: zip2 v0.4s, v0.4s, v4.4s +; CHECK-NEXT: ext v4.16b, v6.16b, v7.16b, #4 +; CHECK-NEXT: zip1 v6.4s, v1.4s, v3.4s +; CHECK-NEXT: zip2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v0.4s, v5.4s, v0.4s +; CHECK-NEXT: add v2.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v3.4s, v16.4s, v6.4s +; CHECK-NEXT: sub v1.4s, v17.4s, v1.4s +; CHECK-NEXT: cmlt v6.8h, v0.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v1.8h, #0 +; CHECK-NEXT: cmlt v4.8h, v3.8h, #0 +; CHECK-NEXT: add v0.4s, v6.4s, v0.4s +; CHECK-NEXT: add v1.4s, v7.4s, v1.4s +; CHECK-NEXT: cmlt v5.8h, v2.8h, #0 +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: eor v1.16b, v1.16b, v7.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: eor v1.16b, v3.16b, v4.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: eor v1.16b, v2.16b, v5.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16