diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18531,6 +18531,250 @@
                        DAG.getConstant(0, DL, MVT::i64));
 }
 
+static bool isLoadOrMultipleLoads(SDValue B,
+                                  SmallVector<LoadSDNode *> &Loads) {
+  SDValue BV = peekThroughOneUseBitcasts(B);
+  if (!BV->hasOneUse())
+    return false;
+  if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
+    if (!Ld->isSimple())
+      return false;
+    Loads.push_back(Ld);
+    return true;
+  } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
+             BV.getOpcode() == ISD::CONCAT_VECTORS) {
+    for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
+      auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
+      if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
+        return false;
+      Loads.push_back(Ld);
+    }
+    return true;
+  } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    // Try to find a tree of shuffles and concats from how IR shuffles of loads
+    // are lowered. Note that this only comes up because we do not always visit
+    // operands before uses. After that is fixed this can be removed; in the
+    // meantime this is fairly specific to the lowering we expect from IR.
+    // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
+    //   t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
+    //     t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
+    //       t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
+    //       t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
+    //     t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
+    //       t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
+    //   t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
+    //     t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
+    if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
+        B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
+        B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
+        B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
+        B.getOperand(1).getNumOperands() != 4)
+      return false;
+    auto SV1 = cast<ShuffleVectorSDNode>(B);
+    auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
+    int NumElts = B.getValueType().getVectorNumElements();
+    int NumSubElts = NumElts / 4;
+    for (int I = 0; I < NumSubElts; I++) {
+      // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
+      if (SV1->getMaskElt(I) != I ||
+          SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
+          SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
+          SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
+        return false;
+      // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
+      if (SV2->getMaskElt(I) != I ||
+          SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
+          SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
+        return false;
+    }
+    auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
+    auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
+    auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
+    auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
+    if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
+        !Ld2->isSimple() || !Ld3->isSimple())
+      return false;
+    Loads.push_back(Ld0);
+    Loads.push_back(Ld1);
+    Loads.push_back(Ld2);
+    Loads.push_back(Ld3);
+    return true;
+  }
+  return false;
+}
+
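+// Returns true if Op0 and Op1 are identical trees of operations whose leaf
+// loads differ only by a constant offset: each leaf load of Op1 must directly
+// follow the corresponding leaf load of Op0 in memory, e.g. (a sketch)
+// Op0 = zext(load p) and Op1 = zext(load p+4). NumSubLoads is set to the
+// number of loads in each leaf load group (1 for a plain load, more for a
+// buildvector of loads), which must match across all leaves.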
+static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
+                                            SelectionDAG &DAG,
+                                            unsigned &NumSubLoads) {
+  if (!Op0.hasOneUse() || !Op1.hasOneUse())
+    return false;
+
+  SmallVector<LoadSDNode *> Loads0, Loads1;
+  if (isLoadOrMultipleLoads(Op0, Loads0) &&
+      isLoadOrMultipleLoads(Op1, Loads1)) {
+    if (NumSubLoads && Loads0.size() != NumSubLoads)
+      return false;
+    NumSubLoads = Loads0.size();
+    return Loads0.size() == Loads1.size() &&
+           all_of(zip(Loads0, Loads1), [&DAG](auto L) {
+             unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
+             return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
+                    DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
+                                                       Size / 8, 1);
+           });
+  }
+
+  if (Op0.getOpcode() != Op1.getOpcode())
+    return false;
+
+  switch (Op0.getOpcode()) {
+  case ISD::ADD:
+  case ISD::SUB:
+    return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
+                                           DAG, NumSubLoads) &&
+           areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
+                                           DAG, NumSubLoads);
+  case ISD::SIGN_EXTEND:
+  case ISD::ANY_EXTEND:
+  case ISD::ZERO_EXTEND:
+    EVT XVT = Op0.getOperand(0).getValueType();
+    if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
+        XVT.getScalarSizeInBits() != 32)
+      return false;
+    return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
+                                           DAG, NumSubLoads);
+  }
+  return false;
+}
+
+// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
+// into a single load of twice the size, from which we extract the bottom and
+// top parts so that the shl can use a shll2 instruction. The two loads in
+// that example can also be larger trees of instructions, which are identical
+// except for the leaves, which are all loads offset from the LHS, including
+// buildvectors of multiple loads. For example, the RHS tree could be
+// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
+// Whilst it can be common for the larger loads to replace LDP instructions
+// (which doesn't gain anything on its own), the larger loads can help create
+// more efficient code, and in buildvectors prevent the need for ld1 lane
+// inserts, which can be slower than normal loads.
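+//
+// As an illustration (a sketch based on the load_v8i8 test updated in
+// llvm/test/CodeGen/AArch64/extbinopload.ll below), IR of the form:
+//   %l1 = load <8 x i8>, ptr %p
+//   %q = getelementptr i8, ptr %p, i32 8
+//   %l2 = load <8 x i8>, ptr %q
+//   %e1 = zext <8 x i8> %l1 to <8 x i16>
+//   %e2 = zext <8 x i8> %l2 to <8 x i16>
+//   %s = shl <8 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+//   %a = add <8 x i16> %e1, %s
+// now becomes a single v16i8 load:
+//   ldr q0, [x0]
+//   ushll2 v1.8h, v0.16b, #3
+//   uaddw v0.8h, v1.8h, v0.8b
+// instead of an ldp of two d registers followed by ushll and uaddw.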
+static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isFixedLengthVector() ||
+      (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
+       VT.getScalarSizeInBits() != 64))
+    return SDValue();
+
+  SDValue Other = N->getOperand(0);
+  SDValue Shift = N->getOperand(1);
+  if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
+    std::swap(Shift, Other);
+  APInt ShiftAmt;
+  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
+      !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
+    return SDValue();
+
+  if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
+      !ISD::isExtOpcode(Other.getOpcode()) ||
+      Shift.getOperand(0).getOperand(0).getValueType() !=
+          Other.getOperand(0).getValueType() ||
+      !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
+    return SDValue();
+
+  SDValue Op0 = Other.getOperand(0);
+  SDValue Op1 = Shift.getOperand(0).getOperand(0);
+
+  unsigned NumSubLoads = 0;
+  if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
+    return SDValue();
+
+  // Attempt to rule out some unprofitable cases using heuristics (some working
+  // around suboptimal code generation), notably if the extend would not be
+  // able to use ushll2 instructions because the types are not large enough.
+  // Otherwise zips will need to be created, which can increase the instruction
+  // count. For example, a v4i8 to v4i16 extend is rejected here, as
+  // 4 * 16 == 64 < 128.
+  unsigned NumElts = Op0.getValueType().getVectorNumElements();
+  unsigned NumSubElts = NumElts / NumSubLoads;
+  if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
+      (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
+       Op0.getValueType().getSizeInBits() < 128 &&
+       !DAG.getTargetLoweringInfo().isTypeLegal(Op0.getValueType())))
+    return SDValue();
+
+  // Recreate the tree with the new combined loads.
+  std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
+      [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
+        EVT DVT =
+            Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
+
+        SmallVector<LoadSDNode *> Loads0, Loads1;
+        if (isLoadOrMultipleLoads(Op0, Loads0) &&
+            isLoadOrMultipleLoads(Op1, Loads1)) {
+          EVT LoadVT = EVT::getVectorVT(
+              *DAG.getContext(), Op0.getValueType().getScalarType(),
+              Op0.getValueType().getVectorNumElements() / Loads0.size());
+          EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
+
+          SmallVector<SDValue> NewLoads;
+          for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
+            SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
+                                       L0->getBasePtr(), L0->getPointerInfo(),
+                                       L0->getOriginalAlign());
+            DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
+            DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
+            NewLoads.push_back(Load);
+          }
+          return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
+        }
+
+        SmallVector<SDValue> Ops;
+        for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
+          Ops.push_back(GenCombinedTree(O0, O1, DAG));
+        return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
+      };
+  SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
+
+  SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
+  int Hi = NumSubElts, Lo = 0;
+  for (unsigned i = 0; i < NumSubLoads; i++) {
+    for (unsigned j = 0; j < NumSubElts; j++) {
+      LowMask[i * NumSubElts + j] = Lo++;
+      HighMask[i * NumSubElts + j] = Hi++;
+    }
+    Lo += NumSubElts;
+    Hi += NumSubElts;
+  }
+  SDLoc DL(N);
+  SDValue Ext0, Ext1;
+  // Extract the top and bottom lanes, then extend the result. If the two
+  // operands use the same extend opcode, extend the result first and then
+  // extract the lanes instead, as that produces slightly smaller code.
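+  // As an illustrative example, with NumSubLoads == 2 and NumSubElts == 4 the
+  // masks built above are LowMask = <0,1,2,3,8,9,10,11> and
+  // HighMask = <4,5,6,7,12,13,14,15>, selecting the bottom and top halves of
+  // each double-width load within NewOp.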
+ if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) { + SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), + NewOp, DAG.getConstant(0, DL, MVT::i64)); + SDValue SubH = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp, + DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64)); + SDValue Extr0 = + DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask); + SDValue Extr1 = + DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask); + Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0); + Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1); + } else { + EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext()); + SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp); + SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext, + DAG.getConstant(0, DL, MVT::i64)); + SDValue SubH = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext, + DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64)); + Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask); + Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask); + } + SDValue NShift = + DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1)); + return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift); +} + static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Try to change sum of two reductions. @@ -18553,6 +18797,9 @@ if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG)) return Val; + if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG)) + return Val; + return performAddSubLongCombine(N, DCI); } diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -55,9 +55,9 @@ define <4 x i32> @load_v4i16_v4i32(ptr %p) { ; CHECK-LABEL: load_v4i16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d1, d0, [x0] -; CHECK-NEXT: ushll v0.4s, v0.4h, #3 -; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 +; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h ; CHECK-NEXT: ret %l1 = load <4 x i16>, ptr %p %q = getelementptr i8, ptr %p, i32 8 @@ -91,11 +91,10 @@ define <4 x i32> @load_v4i8_v4i32(ptr %p) { ; CHECK-LABEL: load_v4i8_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s1, s0, [x0] +; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #3 -; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 +; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h ; CHECK-NEXT: ret %l1 = load <4 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 4 @@ -110,30 +109,28 @@ define <4 x i32> @load_v4i12_v4i32(ptr %p) { ; CHECK-LABEL: load_v4i12_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldur w8, [x0, #6] -; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: ldrh w12, [x0, #10] -; CHECK-NEXT: and w10, w8, #0xfff -; CHECK-NEXT: ldrh w13, [x0, #4] -; CHECK-NEXT: and w11, w9, #0xfff +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: ldr w9, [x0, #8] +; CHECK-NEXT: ubfx x10, x8, #48, #12 +; CHECK-NEXT: lsr x11, x8, #60 +; CHECK-NEXT: orr w11, w11, w9, lsl #4 +; CHECK-NEXT: and w12, w8, #0xfff +; CHECK-NEXT: and w11, w11, #0xfff ; CHECK-NEXT: fmov s0, w10 ; CHECK-NEXT: ubfx w10, w8, #12, #12 -; CHECK-NEXT: fmov s1, w11 -; CHECK-NEXT: ubfx w11, w9, #12, #12 -; CHECK-NEXT: orr x8, x8, x12, lsl #32 -; CHECK-NEXT: orr x9, x9, x13, lsl #32 -; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: ubfx x8, x8, #24, 
#12 -; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: ubfx x9, x9, #24, #12 -; CHECK-NEXT: mov v0.s[2], w8 -; CHECK-NEXT: ubfx w8, w12, #4, #12 -; CHECK-NEXT: mov v1.s[2], w9 -; CHECK-NEXT: ubfx w9, w13, #4, #12 -; CHECK-NEXT: mov v0.s[3], w8 -; CHECK-NEXT: mov v1.s[3], w9 -; CHECK-NEXT: shl v0.4s, v0.4s, #3 -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: fmov s1, w12 +; CHECK-NEXT: mov v0.h[1], w11 +; CHECK-NEXT: ubfx w11, w9, #8, #12 +; CHECK-NEXT: mov v1.h[1], w10 +; CHECK-NEXT: ubfx x10, x8, #24, #12 +; CHECK-NEXT: lsr x9, x9, #20 +; CHECK-NEXT: ubfx x8, x8, #36, #12 +; CHECK-NEXT: mov v0.h[2], w11 +; CHECK-NEXT: mov v1.h[2], w10 +; CHECK-NEXT: mov v0.h[3], w9 +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: ret %l1 = load <4 x i12>, ptr %p %q = getelementptr i8, ptr %p, i32 6 @@ -148,9 +145,9 @@ define <8 x i16> @load_v8i8(ptr %p) { ; CHECK-LABEL: load_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d1, d0, [x0] -; CHECK-NEXT: ushll v0.8h, v0.8b, #3 -; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #3 +; CHECK-NEXT: uaddw v0.8h, v1.8h, v0.8b ; CHECK-NEXT: ret %l1 = load <8 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 8 @@ -165,11 +162,10 @@ define <8 x i16> @loadadd_v8i8(ptr %p1, ptr %p2) { ; CHECK-LABEL: loadadd_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] -; CHECK-NEXT: ldp d3, d2, [x1] -; CHECK-NEXT: add v0.8b, v0.8b, v3.8b -; CHECK-NEXT: add v1.8b, v1.8b, v2.8b -; CHECK-NEXT: ushll v1.8h, v1.8b, #3 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #3 ; CHECK-NEXT: uaddw v0.8h, v1.8h, v0.8b ; CHECK-NEXT: ret %l11 = load <8 x i8>, ptr %p1 @@ -190,14 +186,14 @@ define <8 x i32> @loadaddext_v8i8(ptr %p1, ptr %p2) { ; CHECK-LABEL: loadaddext_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d2, d0, [x0] -; CHECK-NEXT: ldp d3, d1, [x1] -; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: uaddl2 v2.8h, v0.16b, v1.16b ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 -; CHECK-NEXT: ushll v0.4s, v0.4h, #3 -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v2.8h -; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #3 +; CHECK-NEXT: ushll v2.4s, v2.4h, #3 +; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v0.8h +; CHECK-NEXT: uaddw v0.4s, v2.4s, v0.4h ; CHECK-NEXT: ret %l11 = load <8 x i8>, ptr %p1 %q1 = getelementptr i8, ptr %p1, i32 8 @@ -221,15 +217,10 @@ define <4 x i32> @loadaddext_v4i8(ptr %p1, ptr %p2) { ; CHECK-LABEL: loadaddext_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: ldp s2, s3, [x1] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: add v1.4h, v1.4h, v3.4h -; CHECK-NEXT: add v0.4h, v0.4h, v2.4h -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 ; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h ; CHECK-NEXT: ret %l11 = load <4 x i8>, ptr %p1 @@ -321,15 +312,14 @@ define <8 x i32> @load_bv_v4i8_i32(ptr %p, ptr %q) { ; CHECK-LABEL: load_bv_v4i8_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: ushll v0.8h, v0.8b, 
#0 -; CHECK-NEXT: ld1 { v1.s }[1], [x1] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 v2.4s, v1.8h, #3 -; CHECK-NEXT: ushll v3.4s, v1.4h, #3 -; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v0.8h -; CHECK-NEXT: uaddw v0.4s, v3.4s, v0.4h +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 +; CHECK-NEXT: uaddw v0.4s, v2.4s, v0.4h +; CHECK-NEXT: uaddw v1.4s, v3.4s, v1.4h ; CHECK-NEXT: ret %j1 = load <4 x i8>, ptr %p %p1 = getelementptr i8, ptr %p, i32 4 @@ -349,12 +339,12 @@ define <8 x i32> @load_bv_v4i16_i32(ptr %p, ptr %q) { ; CHECK-LABEL: load_bv_v4i16_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] -; CHECK-NEXT: ldp d3, d2, [x1] -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 -; CHECK-NEXT: ushll v2.4s, v2.4h, #3 -; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h -; CHECK-NEXT: uaddw v1.4s, v2.4s, v3.4h +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 +; CHECK-NEXT: uaddw v0.4s, v2.4s, v0.4h +; CHECK-NEXT: uaddw v1.4s, v3.4s, v1.4h ; CHECK-NEXT: ret %j1 = load <4 x i16>, ptr %p %p1 = getelementptr i8, ptr %p, i32 8 @@ -575,30 +565,26 @@ define <16 x i32> @double2_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ptr %u, ptr %v, ptr %w) { ; CHECK-LABEL: double2_bv_4xv4i8_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x2] -; CHECK-NEXT: ldp s2, s3, [x0] -; CHECK-NEXT: ldp s4, s5, [x6] -; CHECK-NEXT: ldp s6, s7, [x4] -; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 -; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 -; CHECK-NEXT: ld1 { v4.s }[1], [x7], #4 -; CHECK-NEXT: ld1 { v6.s }[1], [x5], #4 -; CHECK-NEXT: ld1 { v1.s }[1], [x3] -; CHECK-NEXT: ld1 { v3.s }[1], [x1] -; CHECK-NEXT: ld1 { v5.s }[1], [x7] -; CHECK-NEXT: ld1 { v7.s }[1], [x5] -; CHECK-NEXT: usubl v2.8h, v2.8b, v6.8b -; CHECK-NEXT: usubl v4.8h, v0.8b, v4.8b -; CHECK-NEXT: usubl v1.8h, v1.8b, v5.8b -; CHECK-NEXT: usubl v3.8h, v3.8b, v7.8b -; CHECK-NEXT: shll v5.4s, v1.4h, #16 -; CHECK-NEXT: shll v0.4s, v3.4h, #16 -; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 -; CHECK-NEXT: shll2 v6.4s, v1.8h, #16 -; CHECK-NEXT: saddw2 v1.4s, v3.4s, v2.8h -; CHECK-NEXT: saddw2 v3.4s, v6.4s, v4.8h -; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: saddw v2.4s, v5.4s, v4.4h +; CHECK-NEXT: ldr d0, [x4] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: ldr d3, [x1] +; CHECK-NEXT: ldr d6, [x5] +; CHECK-NEXT: ldr d1, [x2] +; CHECK-NEXT: ldr d4, [x3] +; CHECK-NEXT: ldr d5, [x7] +; CHECK-NEXT: ldr d7, [x6] +; CHECK-NEXT: usubl v0.8h, v2.8b, v0.8b +; CHECK-NEXT: usubl v2.8h, v3.8b, v6.8b +; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v3.8h, v1.8b, v7.8b +; CHECK-NEXT: shll2 v1.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 +; CHECK-NEXT: saddw v0.4s, v1.4s, v0.4h +; CHECK-NEXT: saddw v1.4s, v5.4s, v2.4h +; CHECK-NEXT: shll2 v2.4s, v3.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v4.8h, #16 +; CHECK-NEXT: saddw v2.4s, v2.4s, v3.4h +; CHECK-NEXT: saddw v3.4s, v5.4s, v4.4h ; CHECK-NEXT: ret %j1 = load <4 x i8>, ptr %p %p1 = getelementptr i8, ptr %p, i32 4 @@ -1270,12 +1256,11 @@ define <8 x i32> @commuted_loads(ptr %p1, ptr %p2) { ; CHECK-LABEL: commuted_loads: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp d0, d1, [x0] -; CHECK-NEXT: ldp d3, d2, [x1] -; CHECK-NEXT: add v0.8b, v3.8b, v0.8b -; CHECK-NEXT: add v1.8b, v2.8b, v1.8b +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: add v0.16b, v1.16b, v0.16b +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: ushll2 v2.4s, v1.8h, #3 
 ; CHECK-NEXT: ushll v3.4s, v1.4h, #3
 ; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v0.8h
@@ -1353,3 +1338,74 @@
   %a = sub <8 x i32> %se2, %e1
   ret <8 x i32> %a
 }
+
+define <4 x i32> @bitcast(ptr %p) {
+; CHECK-LABEL: bitcast:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3
+; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
+; CHECK-NEXT: ret
+  %l1b = load float, ptr %p
+  %l1 = bitcast float %l1b to <4 x i8>
+  %q = getelementptr i8, ptr %p, i32 4
+  %l2b = load float, ptr %q
+  %l2 = bitcast float %l2b to <4 x i8>
+  %e1 = zext <4 x i8> %l1 to <4 x i32>
+  %e2 = zext <4 x i8> %l2 to <4 x i32>
+  %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
+  %a = add <4 x i32> %e1, %e3
+  ret <4 x i32> %a
+}
+
+define <4 x i32> @atomic(ptr %p) {
+; CHECK-LABEL: atomic:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldar w8, [x0]
+; CHECK-NEXT: ldr s0, [x0, #4]
+; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: zip1 v1.8b, v1.8b, v0.8b
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #3
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: ret
+  %l1b = load atomic float, ptr %p acquire, align 4
+  %l1 = bitcast float %l1b to <4 x i8>
+  %q = getelementptr i8, ptr %p, i32 4
+  %l2b = load float, ptr %q
+  %l2 = bitcast float %l2b to <4 x i8>
+  %e1 = zext <4 x i8> %l1 to <4 x i32>
+  %e2 = zext <4 x i8> %l2 to <4 x i32>
+  %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
+  %a = add <4 x i32> %e1, %e3
+  ret <4 x i32> %a
+}
+
+define <4 x i32> @volatile(ptr %p) {
+; CHECK-LABEL: volatile:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: ldr s1, [x0, #4]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ushll v1.4s, v1.4h, #3
+; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+  %l1b = load volatile float, ptr %p
+  %l1 = bitcast float %l1b to <4 x i8>
+  %q = getelementptr i8, ptr %p, i32 4
+  %l2b = load float, ptr %q
+  %l2 = bitcast float %l2b to <4 x i8>
+  %e1 = zext <4 x i8> %l1 to <4 x i32>
+  %e2 = zext <4 x i8> %l2 to <4 x i32>
+  %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
+  %a = add <4 x i32> %e1, %e3
+  ret <4 x i32> %a
+}
diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll
--- a/llvm/test/CodeGen/AArch64/insert-extend.ll
+++ b/llvm/test/CodeGen/AArch64/insert-extend.ll
@@ -48,120 +48,114 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3
 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: sxtw x8, w3
-; CHECK-NEXT: sxtw x9, w1
-; CHECK-NEXT: add x10, x2, x8
-; CHECK-NEXT: add x11, x0, x9
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: sxtw x9, w3
+; CHECK-NEXT: add x10, x0, x8
+; CHECK-NEXT: add x11, x2, x9
+; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: add x12, x10, x8
-; CHECK-NEXT: add x13, x11, x9
-; CHECK-NEXT: add x8, x12, x8
-; CHECK-NEXT: add x9, x13, x9
-; CHECK-NEXT: ldp s0, s6, [x11]
-; CHECK-NEXT: ldp s3, s7, [x10]
-; CHECK-NEXT: ldp s1, s5, [x8]
-; CHECK-NEXT: ldp s2, s4, [x9]
-; CHECK-NEXT: ld1 { v1.s }[1], [x12], #4
-; CHECK-NEXT: ld1 { v2.s }[1], [x13], #4
-; CHECK-NEXT: ld1 { v3.s }[1], [x2], #4
-; CHECK-NEXT: ld1 { v0.s }[1], [x0], #4
-; CHECK-NEXT: ld1 { v5.s }[1], [x12]
-; CHECK-NEXT: ld1 { v4.s }[1], [x13]
-; CHECK-NEXT: ld1 { v7.s }[1], [x2]
-; CHECK-NEXT: ld1 { v6.s }[1], [x0]
-; CHECK-NEXT: usubl v0.8h, v0.8b, v3.8b
-; CHECK-NEXT: usubl 
v1.8h, v2.8b, v1.8b -; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b +; CHECK-NEXT: ldr d1, [x2] +; CHECK-NEXT: ldr d2, [x10] +; CHECK-NEXT: add x10, x11, x9 +; CHECK-NEXT: ldr d3, [x11] +; CHECK-NEXT: ldr d4, [x12] +; CHECK-NEXT: ldr d5, [x10] +; CHECK-NEXT: ldr d6, [x12, x8] +; CHECK-NEXT: ldr d7, [x10, x9] +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: usubl v1.8h, v4.8b, v5.8b +; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 ; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b -; CHECK-NEXT: shll v4.4s, v2.4h, #16 -; CHECK-NEXT: shll v5.4s, v3.4h, #16 -; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 -; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-NEXT: saddw2 v3.4s, v3.4s, v0.8h -; CHECK-NEXT: saddw v0.4s, v5.4s, v0.4h -; CHECK-NEXT: saddw2 v2.4s, v2.4s, v1.8h -; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h -; CHECK-NEXT: rev64 v6.4s, v0.4s -; CHECK-NEXT: rev64 v17.4s, v3.4s +; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h +; CHECK-NEXT: shll2 v7.4s, v1.8h, #16 +; CHECK-NEXT: shll2 v6.4s, v3.8h, #16 +; CHECK-NEXT: saddw v1.4s, v7.4s, v1.4h +; CHECK-NEXT: rev64 v4.4s, v0.4s ; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: addp v7.4s, v1.4s, v2.4s -; CHECK-NEXT: rev64 v4.4s, v1.4s -; CHECK-NEXT: addp v16.4s, v0.4s, v3.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v17.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s -; CHECK-NEXT: ext v18.16b, v7.16b, v7.16b, #8 -; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s -; CHECK-NEXT: zip1 v5.4s, v0.4s, v3.4s -; CHECK-NEXT: uzp2 v19.4s, v7.4s, v16.4s -; CHECK-NEXT: uzp1 v7.4s, v7.4s, v16.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s -; CHECK-NEXT: uzp1 v6.4s, v18.4s, v16.4s -; CHECK-NEXT: zip2 v4.4s, v2.4s, v1.4s -; CHECK-NEXT: uzp2 v16.4s, v18.4s, v16.4s -; CHECK-NEXT: mov v2.s[1], v1.s[0] -; CHECK-NEXT: ext v1.16b, v0.16b, v5.16b, #8 -; CHECK-NEXT: mov v0.s[3], v3.s[2] -; CHECK-NEXT: add v7.4s, v19.4s, v7.4s -; CHECK-NEXT: sub v3.4s, v6.4s, v16.4s -; CHECK-NEXT: rev64 v5.4s, v7.4s -; CHECK-NEXT: mov v2.d[1], v1.d[1] -; CHECK-NEXT: mov v4.d[1], v0.d[1] +; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h +; CHECK-NEXT: rev64 v7.4s, v1.4s +; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s ; CHECK-NEXT: rev64 v6.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v7.4s, v5.4s -; CHECK-NEXT: add v5.4s, v4.4s, v2.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v6.4s -; CHECK-NEXT: rev64 v4.4s, v5.4s -; CHECK-NEXT: addp v6.4s, v7.4s, v5.4s -; CHECK-NEXT: rev64 v7.4s, v2.4s -; CHECK-NEXT: addp v3.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v4.4s, v5.4s, v4.4s -; CHECK-NEXT: zip1 v16.4s, v6.4s, v6.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s -; CHECK-NEXT: ext v17.16b, v1.16b, v3.16b, #8 -; CHECK-NEXT: ext v5.16b, v6.16b, v4.16b, #4 -; CHECK-NEXT: ext v7.16b, v3.16b, v2.16b, #4 -; CHECK-NEXT: ext v18.16b, v0.16b, v6.16b, #4 -; CHECK-NEXT: trn2 v0.4s, v16.4s, v0.4s -; CHECK-NEXT: ext v16.16b, v17.16b, v1.16b, #4 -; CHECK-NEXT: zip2 v7.4s, v7.4s, v3.4s -; CHECK-NEXT: zip2 v5.4s, v5.4s, v6.4s -; CHECK-NEXT: ext v18.16b, v18.16b, v18.16b, #4 -; CHECK-NEXT: mov v1.s[2], v3.s[1] -; CHECK-NEXT: uzp2 v16.4s, v17.4s, v16.4s -; CHECK-NEXT: ext v7.16b, v2.16b, v7.16b, #12 -; CHECK-NEXT: ext v5.16b, v4.16b, v5.16b, #12 -; CHECK-NEXT: mov v2.s[2], v3.s[3] -; CHECK-NEXT: mov v4.s[2], v6.s[3] -; CHECK-NEXT: sub v17.4s, v0.4s, v18.4s -; CHECK-NEXT: mov v18.s[0], v6.s[1] -; CHECK-NEXT: sub v19.4s, v1.4s, v16.4s -; CHECK-NEXT: sub v20.4s, v2.4s, v7.4s -; CHECK-NEXT: sub v21.4s, v4.4s, v5.4s -; CHECK-NEXT: mov v1.s[1], v3.s[0] -; CHECK-NEXT: mov v2.s[1], 
v3.s[2] -; CHECK-NEXT: mov v4.s[1], v6.s[2] -; CHECK-NEXT: add v0.4s, v0.4s, v18.4s -; CHECK-NEXT: add v1.4s, v1.4s, v16.4s -; CHECK-NEXT: add v2.4s, v2.4s, v7.4s -; CHECK-NEXT: add v3.4s, v4.4s, v5.4s -; CHECK-NEXT: mov v2.d[1], v20.d[1] -; CHECK-NEXT: mov v3.d[1], v21.d[1] -; CHECK-NEXT: mov v0.d[1], v17.d[1] -; CHECK-NEXT: mov v1.d[1], v19.d[1] -; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 -; CHECK-NEXT: cmlt v5.8h, v3.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v0.8h, #0 -; CHECK-NEXT: cmlt v7.8h, v1.8h, #0 -; CHECK-NEXT: add v0.4s, v6.4s, v0.4s +; CHECK-NEXT: sub v5.4s, v2.4s, v5.4s +; CHECK-NEXT: sub v7.4s, v1.4s, v7.4s +; CHECK-NEXT: zip1 v16.4s, v5.4s, v4.4s +; CHECK-NEXT: addp v1.4s, v3.4s, v1.4s +; CHECK-NEXT: sub v6.4s, v3.4s, v6.4s +; CHECK-NEXT: addp v0.4s, v2.4s, v0.4s +; CHECK-NEXT: zip2 v17.4s, v7.4s, v6.4s +; CHECK-NEXT: mov v7.s[1], v6.s[0] +; CHECK-NEXT: ext v2.16b, v5.16b, v16.16b, #8 +; CHECK-NEXT: mov v5.s[3], v4.s[2] +; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: uzp2 v4.4s, v1.4s, v0.4s +; CHECK-NEXT: mov v7.d[1], v2.d[1] +; CHECK-NEXT: mov v17.d[1], v5.d[1] +; CHECK-NEXT: uzp1 v1.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp1 v2.4s, v3.4s, v0.4s +; CHECK-NEXT: uzp2 v0.4s, v3.4s, v0.4s +; CHECK-NEXT: add v3.4s, v17.4s, v7.4s +; CHECK-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-NEXT: sub v4.4s, v7.4s, v17.4s +; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s +; CHECK-NEXT: rev64 v2.4s, v3.4s +; CHECK-NEXT: rev64 v5.4s, v4.4s +; CHECK-NEXT: rev64 v7.4s, v0.4s +; CHECK-NEXT: rev64 v6.4s, v1.4s +; CHECK-NEXT: addp v16.4s, v0.4s, v4.4s +; CHECK-NEXT: addp v17.4s, v1.4s, v3.4s +; CHECK-NEXT: sub v4.4s, v4.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v3.4s, v2.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s +; CHECK-NEXT: ext v3.16b, v16.16b, v4.16b, #4 +; CHECK-NEXT: ext v5.16b, v0.16b, v16.16b, #8 +; CHECK-NEXT: ext v6.16b, v17.16b, v2.16b, #4 +; CHECK-NEXT: zip1 v7.4s, v17.4s, v17.4s +; CHECK-NEXT: zip2 v3.4s, v3.4s, v16.4s +; CHECK-NEXT: ext v18.16b, v5.16b, v0.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v6.4s, v17.4s +; CHECK-NEXT: trn2 v7.4s, v7.4s, v1.4s +; CHECK-NEXT: ext v1.16b, v1.16b, v17.16b, #4 +; CHECK-NEXT: ext v3.16b, v4.16b, v3.16b, #12 +; CHECK-NEXT: mov v0.s[2], v16.s[1] +; CHECK-NEXT: ext v6.16b, v2.16b, v6.16b, #12 +; CHECK-NEXT: mov v4.s[2], v16.s[3] +; CHECK-NEXT: uzp2 v5.4s, v5.4s, v18.4s +; CHECK-NEXT: mov v2.s[2], v17.s[3] +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: sub v18.4s, v4.4s, v3.4s +; CHECK-NEXT: sub v19.4s, v0.4s, v5.4s +; CHECK-NEXT: sub v20.4s, v2.4s, v6.4s +; CHECK-NEXT: mov v4.s[1], v16.s[2] +; CHECK-NEXT: sub v21.4s, v7.4s, v1.4s +; CHECK-NEXT: mov v2.s[1], v17.s[2] +; CHECK-NEXT: mov v0.s[1], v16.s[0] +; CHECK-NEXT: mov v1.s[0], v17.s[1] +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: add v2.4s, v2.4s, v6.4s +; CHECK-NEXT: add v0.4s, v0.4s, v5.4s ; CHECK-NEXT: add v1.4s, v7.4s, v1.4s -; CHECK-NEXT: add v2.4s, v4.4s, v2.4s -; CHECK-NEXT: add v3.4s, v5.4s, v3.4s -; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v5.16b -; CHECK-NEXT: eor v1.16b, v1.16b, v7.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: add v1.4s, v3.4s, v2.4s +; CHECK-NEXT: mov v3.d[1], v18.d[1] +; CHECK-NEXT: mov v2.d[1], v20.d[1] +; CHECK-NEXT: mov v1.d[1], v21.d[1] +; CHECK-NEXT: mov v0.d[1], v19.d[1] +; CHECK-NEXT: cmlt v4.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v5.8h, v2.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v1.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v0.8h, #0 +; CHECK-NEXT: add v1.4s, v6.4s, 
v1.4s +; CHECK-NEXT: add v0.4s, v7.4s, v0.4s +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: eor v3.16b, v3.16b, v4.16b +; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v7.16b +; CHECK-NEXT: eor v1.16b, v1.16b, v6.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v2.4s, v3.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -4,123 +4,117 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) { ; CHECK-LABEL: v1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: sxtw x10, w3 -; CHECK-NEXT: add x9, x0, x8 -; CHECK-NEXT: add x12, x2, x10 -; CHECK-NEXT: add x11, x9, x8 -; CHECK-NEXT: add x13, x12, x10 -; CHECK-NEXT: add x8, x11, x8 -; CHECK-NEXT: add x10, x13, x10 -; CHECK-NEXT: ldp s1, s0, [x9] -; CHECK-NEXT: ldp s7, s6, [x12] -; CHECK-NEXT: ldp s3, s2, [x8] -; CHECK-NEXT: ldp s5, s4, [x10] -; CHECK-NEXT: ld1 { v5.s }[1], [x13], #4 -; CHECK-NEXT: ld1 { v3.s }[1], [x11], #4 -; CHECK-NEXT: ld1 { v7.s }[1], [x2], #4 -; CHECK-NEXT: ld1 { v1.s }[1], [x0], #4 -; CHECK-NEXT: ld1 { v4.s }[1], [x13] -; CHECK-NEXT: ld1 { v2.s }[1], [x11] -; CHECK-NEXT: ld1 { v6.s }[1], [x2] -; CHECK-NEXT: ld1 { v0.s }[1], [x0] -; CHECK-NEXT: usubl v3.8h, v3.8b, v5.8b -; CHECK-NEXT: usubl v2.8h, v2.8b, v4.8b -; CHECK-NEXT: usubl v1.8h, v1.8b, v7.8b -; CHECK-NEXT: usubl v0.8h, v0.8b, v6.8b -; CHECK-NEXT: shll v4.4s, v2.4h, #16 -; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-NEXT: shll v5.4s, v0.4h, #16 -; CHECK-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-NEXT: saddw2 v2.4s, v2.4s, v3.8h -; CHECK-NEXT: saddw v3.4s, v4.4s, v3.4h -; CHECK-NEXT: saddw2 v0.4s, v0.4s, v1.8h -; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h -; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s +; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 +; CHECK-NEXT: sxtw x9, w3 +; CHECK-NEXT: add x10, x0, x8 +; CHECK-NEXT: add x11, x2, x9 +; CHECK-NEXT: add x12, x10, x8 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x2] +; CHECK-NEXT: ldr d2, [x10] +; CHECK-NEXT: add x10, x11, x9 +; CHECK-NEXT: ldr d6, [x12, x8] +; CHECK-NEXT: ldr d7, [x10, x9] +; CHECK-NEXT: ldr d3, [x11] +; CHECK-NEXT: ldr d4, [x12] +; CHECK-NEXT: ldr d5, [x10] +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: usubl v1.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b +; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 +; CHECK-NEXT: shll2 v6.4s, v3.8h, #16 +; CHECK-NEXT: shll2 v7.4s, v1.8h, #16 +; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h +; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h +; CHECK-NEXT: saddw v1.4s, v7.4s, v1.4h +; CHECK-NEXT: zip1 v5.4s, v2.4s, v0.4s +; CHECK-NEXT: zip2 v4.4s, v2.4s, v0.4s +; CHECK-NEXT: uzp2 v7.4s, v3.4s, v1.4s +; CHECK-NEXT: mov v17.16b, v1.16b +; CHECK-NEXT: zip2 v18.4s, v3.4s, v1.4s +; CHECK-NEXT: ext v19.16b, v2.16b, v5.16b, #8 +; CHECK-NEXT: uzp2 v7.4s, v7.4s, v3.4s +; CHECK-NEXT: mov v2.s[3], v0.s[2] +; CHECK-NEXT: zip2 v6.4s, v1.4s, v3.4s ; CHECK-NEXT: ext v16.16b, v3.16b, v3.16b, #12 -; CHECK-NEXT: zip1 v17.4s, v1.4s, v0.4s -; 
CHECK-NEXT: mov v7.16b, v3.16b -; CHECK-NEXT: zip2 v4.4s, v2.4s, v3.4s -; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s -; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s -; CHECK-NEXT: mov v7.s[0], v2.s[1] -; CHECK-NEXT: ext v16.16b, v2.16b, v16.16b, #12 -; CHECK-NEXT: ext v19.16b, v1.16b, v17.16b, #8 -; CHECK-NEXT: uzp2 v5.4s, v5.4s, v3.4s -; CHECK-NEXT: mov v2.s[1], v3.s[0] -; CHECK-NEXT: mov v1.s[3], v0.s[2] -; CHECK-NEXT: mov v7.d[1], v17.d[1] -; CHECK-NEXT: mov v5.d[1], v6.d[1] -; CHECK-NEXT: mov v2.d[1], v19.d[1] -; CHECK-NEXT: mov v18.d[1], v1.d[1] -; CHECK-NEXT: mov v16.d[1], v6.d[1] -; CHECK-NEXT: mov v4.d[1], v1.d[1] -; CHECK-NEXT: add v0.4s, v7.4s, v2.4s -; CHECK-NEXT: add v1.4s, v5.4s, v18.4s +; CHECK-NEXT: mov v17.s[1], v3.s[0] +; CHECK-NEXT: mov v3.s[0], v1.s[1] +; CHECK-NEXT: mov v7.d[1], v4.d[1] +; CHECK-NEXT: mov v18.d[1], v2.d[1] +; CHECK-NEXT: mov v17.d[1], v19.d[1] +; CHECK-NEXT: mov v3.d[1], v5.d[1] +; CHECK-NEXT: ext v16.16b, v1.16b, v16.16b, #12 +; CHECK-NEXT: add v1.4s, v7.4s, v18.4s +; CHECK-NEXT: mov v6.d[1], v2.d[1] +; CHECK-NEXT: add v0.4s, v3.4s, v17.4s +; CHECK-NEXT: mov v16.d[1], v4.d[1] +; CHECK-NEXT: sub v2.4s, v17.4s, v3.4s +; CHECK-NEXT: rev64 v3.4s, v1.4s ; CHECK-NEXT: rev64 v5.4s, v0.4s -; CHECK-NEXT: sub v3.4s, v4.4s, v16.4s -; CHECK-NEXT: rev64 v4.4s, v1.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s +; CHECK-NEXT: sub v4.4s, v6.4s, v16.4s +; CHECK-NEXT: mov v3.d[1], v1.d[1] ; CHECK-NEXT: mov v5.d[1], v0.d[1] -; CHECK-NEXT: add v6.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v3.4s -; CHECK-NEXT: mov v4.d[1], v1.d[1] -; CHECK-NEXT: rev64 v7.4s, v2.4s +; CHECK-NEXT: add v6.4s, v4.4s, v2.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v3.4s +; CHECK-NEXT: rev64 v4.4s, v2.4s ; CHECK-NEXT: rev64 v3.4s, v6.4s ; CHECK-NEXT: add v1.4s, v1.4s, v5.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v7.4s, v2.4s, v7.4s +; CHECK-NEXT: addp v7.4s, v0.4s, v2.4s ; CHECK-NEXT: addp v5.4s, v1.4s, v6.4s -; CHECK-NEXT: addp v2.4s, v0.4s, v2.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s ; CHECK-NEXT: sub v3.4s, v6.4s, v3.4s -; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v6.4s, v1.4s -; CHECK-NEXT: zip1 v16.4s, v5.4s, v5.4s -; CHECK-NEXT: ext v17.16b, v2.16b, v7.16b, #4 -; CHECK-NEXT: ext v18.16b, v5.16b, v3.16b, #4 -; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s -; CHECK-NEXT: ext v4.16b, v0.16b, v2.16b, #8 -; CHECK-NEXT: ext v6.16b, v1.16b, v5.16b, #4 -; CHECK-NEXT: trn2 v1.4s, v16.4s, v1.4s -; CHECK-NEXT: zip2 v16.4s, v17.4s, v2.4s -; CHECK-NEXT: zip2 v17.4s, v18.4s, v5.4s -; CHECK-NEXT: ext v18.16b, v4.16b, v0.16b, #4 -; CHECK-NEXT: ext v6.16b, v6.16b, v6.16b, #4 -; CHECK-NEXT: ext v16.16b, v7.16b, v16.16b, #12 -; CHECK-NEXT: ext v17.16b, v3.16b, v17.16b, #12 +; CHECK-NEXT: rev64 v6.4s, v0.4s +; CHECK-NEXT: ext v4.16b, v7.16b, v2.16b, #4 +; CHECK-NEXT: rev64 v16.4s, v1.4s +; CHECK-NEXT: ext v17.16b, v5.16b, v3.16b, #4 +; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s +; CHECK-NEXT: zip2 v4.4s, v4.4s, v7.4s +; CHECK-NEXT: ext v6.16b, v0.16b, v7.16b, #8 +; CHECK-NEXT: sub v1.4s, v1.4s, v16.4s +; CHECK-NEXT: zip2 v16.4s, v17.4s, v5.4s +; CHECK-NEXT: zip1 v18.4s, v5.4s, v5.4s +; CHECK-NEXT: ext v19.16b, v1.16b, v5.16b, #4 +; CHECK-NEXT: ext v4.16b, v2.16b, v4.16b, #12 +; CHECK-NEXT: mov v2.s[2], v7.s[3] +; CHECK-NEXT: ext v17.16b, v6.16b, v0.16b, #4 +; CHECK-NEXT: ext v16.16b, v3.16b, v16.16b, #12 ; CHECK-NEXT: mov v3.s[2], v5.s[3] -; CHECK-NEXT: mov v7.s[2], v2.s[3] -; CHECK-NEXT: mov v0.s[2], v2.s[1] -; CHECK-NEXT: uzp2 v4.4s, 
v4.4s, v18.4s -; CHECK-NEXT: sub v20.4s, v3.4s, v17.4s -; CHECK-NEXT: sub v21.4s, v7.4s, v16.4s +; CHECK-NEXT: trn2 v1.4s, v18.4s, v1.4s +; CHECK-NEXT: ext v18.16b, v19.16b, v19.16b, #4 +; CHECK-NEXT: mov v0.s[2], v7.s[1] +; CHECK-NEXT: uzp2 v6.4s, v6.4s, v17.4s +; CHECK-NEXT: sub v17.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v21.4s, v3.4s, v16.4s ; CHECK-NEXT: mov v3.s[1], v5.s[2] -; CHECK-NEXT: mov v7.s[1], v2.s[2] -; CHECK-NEXT: sub v18.4s, v1.4s, v6.4s -; CHECK-NEXT: mov v6.s[0], v5.s[1] -; CHECK-NEXT: sub v19.4s, v0.4s, v4.4s -; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: add v2.4s, v3.4s, v17.4s -; CHECK-NEXT: add v3.4s, v7.4s, v16.4s -; CHECK-NEXT: add v1.4s, v1.4s, v6.4s +; CHECK-NEXT: mov v2.s[1], v7.s[2] +; CHECK-NEXT: sub v19.4s, v1.4s, v18.4s +; CHECK-NEXT: mov v18.s[0], v5.s[1] +; CHECK-NEXT: sub v20.4s, v0.4s, v6.4s +; CHECK-NEXT: mov v0.s[1], v7.s[0] +; CHECK-NEXT: add v3.4s, v3.4s, v16.4s +; CHECK-NEXT: add v2.4s, v2.4s, v4.4s +; CHECK-NEXT: add v1.4s, v1.4s, v18.4s +; CHECK-NEXT: mov v2.d[1], v17.d[1] ; CHECK-NEXT: mov v3.d[1], v21.d[1] -; CHECK-NEXT: mov v2.d[1], v20.d[1] -; CHECK-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NEXT: mov v1.d[1], v18.d[1] -; CHECK-NEXT: mov v0.d[1], v19.d[1] -; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 -; CHECK-NEXT: cmlt v7.8h, v2.8h, #0 +; CHECK-NEXT: add v0.4s, v0.4s, v6.4s +; CHECK-NEXT: mov v1.d[1], v19.d[1] +; CHECK-NEXT: mov v0.d[1], v20.d[1] +; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v3.8h, #0 ; CHECK-NEXT: cmlt v4.8h, v1.8h, #0 -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v2.4s, v7.4s, v2.4s +; CHECK-NEXT: add v2.4s, v6.4s, v2.4s +; CHECK-NEXT: add v3.4s, v7.4s, v3.4s ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 ; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s +; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b +; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b +; CHECK-NEXT: add v2.4s, v3.4s, v2.4s ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: eor v1.16b, v1.16b, v4.16b ; CHECK-NEXT: add v1.4s, v1.4s, v2.4s @@ -226,121 +220,115 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) { ; CHECK-LABEL: v2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: sxtw x10, w3 -; CHECK-NEXT: add x9, x0, x8 -; CHECK-NEXT: add x12, x2, x10 -; CHECK-NEXT: add x11, x9, x8 -; CHECK-NEXT: add x13, x12, x10 -; CHECK-NEXT: add x8, x11, x8 -; CHECK-NEXT: add x10, x13, x10 -; CHECK-NEXT: ldp s1, s0, [x9] -; CHECK-NEXT: ldp s7, s6, [x12] -; CHECK-NEXT: ldp s3, s2, [x8] -; CHECK-NEXT: ldp s5, s4, [x10] -; CHECK-NEXT: ld1 { v5.s }[1], [x13], #4 -; CHECK-NEXT: ld1 { v3.s }[1], [x11], #4 -; CHECK-NEXT: ld1 { v7.s }[1], [x2], #4 -; CHECK-NEXT: ld1 { v1.s }[1], [x0], #4 -; CHECK-NEXT: ld1 { v4.s }[1], [x13] -; CHECK-NEXT: ld1 { v2.s }[1], [x11] -; CHECK-NEXT: ld1 { v6.s }[1], [x2] -; CHECK-NEXT: ld1 { v0.s }[1], [x0] -; CHECK-NEXT: usubl v3.8h, v3.8b, v5.8b -; CHECK-NEXT: usubl v2.8h, v2.8b, v4.8b -; CHECK-NEXT: usubl v1.8h, v1.8b, v7.8b -; CHECK-NEXT: usubl v0.8h, v0.8b, v6.8b -; CHECK-NEXT: shll v4.4s, v2.4h, #16 -; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-NEXT: shll v5.4s, v0.4h, #16 -; CHECK-NEXT: shll2 v0.4s, v0.8h, #16 -; CHECK-NEXT: saddw2 v2.4s, v2.4s, v3.8h -; CHECK-NEXT: saddw v3.4s, v4.4s, v3.4h -; CHECK-NEXT: saddw2 v0.4s, v0.4s, v1.8h -; CHECK-NEXT: 
saddw v1.4s, v5.4s, v1.4h -; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s +; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 +; CHECK-NEXT: sxtw x9, w3 +; CHECK-NEXT: add x10, x0, x8 +; CHECK-NEXT: add x11, x2, x9 +; CHECK-NEXT: add x12, x10, x8 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x2] +; CHECK-NEXT: ldr d2, [x10] +; CHECK-NEXT: add x10, x11, x9 +; CHECK-NEXT: ldr d6, [x12, x8] +; CHECK-NEXT: ldr d7, [x10, x9] +; CHECK-NEXT: ldr d3, [x11] +; CHECK-NEXT: ldr d4, [x12] +; CHECK-NEXT: ldr d5, [x10] +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: usubl v1.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b +; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 +; CHECK-NEXT: shll2 v6.4s, v3.8h, #16 +; CHECK-NEXT: shll2 v7.4s, v1.8h, #16 +; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h +; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h +; CHECK-NEXT: saddw v1.4s, v7.4s, v1.4h +; CHECK-NEXT: zip1 v5.4s, v2.4s, v0.4s ; CHECK-NEXT: ext v17.16b, v3.16b, v3.16b, #12 -; CHECK-NEXT: zip1 v7.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp2 v7.4s, v3.4s, v1.4s ; CHECK-NEXT: mov v16.16b, v3.16b -; CHECK-NEXT: zip2 v4.4s, v2.4s, v3.4s -; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s -; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s -; CHECK-NEXT: mov v16.s[0], v2.s[1] -; CHECK-NEXT: ext v19.16b, v1.16b, v7.16b, #8 -; CHECK-NEXT: ext v17.16b, v2.16b, v17.16b, #12 -; CHECK-NEXT: uzp2 v5.4s, v5.4s, v3.4s -; CHECK-NEXT: mov v1.s[3], v0.s[2] -; CHECK-NEXT: mov v2.s[1], v3.s[0] -; CHECK-NEXT: mov v16.d[1], v7.d[1] -; CHECK-NEXT: mov v5.d[1], v6.d[1] -; CHECK-NEXT: mov v18.d[1], v1.d[1] -; CHECK-NEXT: mov v2.d[1], v19.d[1] -; CHECK-NEXT: mov v4.d[1], v1.d[1] -; CHECK-NEXT: mov v17.d[1], v6.d[1] -; CHECK-NEXT: add v0.4s, v5.4s, v18.4s -; CHECK-NEXT: add v1.4s, v16.4s, v2.4s +; CHECK-NEXT: zip2 v4.4s, v2.4s, v0.4s +; CHECK-NEXT: zip2 v6.4s, v1.4s, v3.4s +; CHECK-NEXT: zip2 v18.4s, v3.4s, v1.4s +; CHECK-NEXT: ext v19.16b, v2.16b, v5.16b, #8 +; CHECK-NEXT: mov v16.s[0], v1.s[1] +; CHECK-NEXT: ext v17.16b, v1.16b, v17.16b, #12 +; CHECK-NEXT: uzp2 v7.4s, v7.4s, v3.4s +; CHECK-NEXT: mov v2.s[3], v0.s[2] +; CHECK-NEXT: mov v1.s[1], v3.s[0] +; CHECK-NEXT: mov v16.d[1], v5.d[1] +; CHECK-NEXT: mov v7.d[1], v4.d[1] +; CHECK-NEXT: mov v18.d[1], v2.d[1] +; CHECK-NEXT: mov v1.d[1], v19.d[1] +; CHECK-NEXT: mov v6.d[1], v2.d[1] +; CHECK-NEXT: mov v17.d[1], v4.d[1] +; CHECK-NEXT: add v0.4s, v7.4s, v18.4s +; CHECK-NEXT: add v2.4s, v16.4s, v1.4s ; CHECK-NEXT: rev64 v3.4s, v0.4s -; CHECK-NEXT: rev64 v5.4s, v1.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v16.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s +; CHECK-NEXT: rev64 v4.4s, v2.4s +; CHECK-NEXT: sub v5.4s, v6.4s, v17.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v16.4s ; CHECK-NEXT: mov v3.d[1], v0.d[1] -; CHECK-NEXT: mov v5.d[1], v1.d[1] -; CHECK-NEXT: add v6.4s, v4.4s, v2.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s -; CHECK-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-NEXT: zip1 v3.4s, v1.4s, v2.4s -; CHECK-NEXT: zip1 v4.4s, v0.4s, v6.4s +; CHECK-NEXT: mov v4.d[1], v2.d[1] +; CHECK-NEXT: add v6.4s, v5.4s, v1.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v3.4s +; CHECK-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NEXT: zip1 v3.4s, v2.4s, v1.4s ; CHECK-NEXT: uzp2 v5.4s, v0.4s, v6.4s -; CHECK-NEXT: mov v17.16b, v1.16b -; CHECK-NEXT: zip2 v7.4s, v0.4s, v6.4s -; CHECK-NEXT: ext v16.16b, v1.16b, v3.16b, #8 -; CHECK-NEXT: trn2 v4.4s, v0.4s, v4.4s +; CHECK-NEXT: zip2 v4.4s, 
v2.4s, v1.4s +; CHECK-NEXT: zip1 v7.4s, v0.4s, v6.4s +; CHECK-NEXT: ext v16.16b, v2.16b, v3.16b, #8 +; CHECK-NEXT: zip2 v17.4s, v0.4s, v6.4s ; CHECK-NEXT: uzp2 v5.4s, v5.4s, v0.4s -; CHECK-NEXT: zip2 v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mov v17.s[3], v2.s[2] -; CHECK-NEXT: mov v0.s[1], v6.s[1] -; CHECK-NEXT: mov v4.d[1], v16.d[1] -; CHECK-NEXT: mov v5.d[1], v1.d[1] -; CHECK-NEXT: mov v7.d[1], v17.d[1] -; CHECK-NEXT: mov v0.d[1], v3.d[1] -; CHECK-NEXT: add v1.4s, v7.4s, v5.4s -; CHECK-NEXT: add v2.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s -; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #4 -; CHECK-NEXT: ext v16.16b, v1.16b, v1.16b, #4 -; CHECK-NEXT: sub v3.4s, v5.4s, v7.4s -; CHECK-NEXT: zip2 v5.4s, v0.4s, v2.4s -; CHECK-NEXT: zip1 v6.4s, v1.4s, v3.4s -; CHECK-NEXT: zip2 v7.4s, v1.4s, v3.4s -; CHECK-NEXT: zip2 v1.4s, v3.4s, v1.4s -; CHECK-NEXT: zip1 v17.4s, v2.4s, v0.4s -; CHECK-NEXT: zip2 v2.4s, v2.4s, v0.4s -; CHECK-NEXT: ext v0.16b, v4.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v16.16b, v3.16b, #8 -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-NEXT: sub v2.4s, v7.4s, v2.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v4.16b, #4 -; CHECK-NEXT: ext v3.16b, v3.16b, v16.16b, #4 -; CHECK-NEXT: sub v5.4s, v6.4s, v17.4s -; CHECK-NEXT: cmlt v7.8h, v2.8h, #0 -; CHECK-NEXT: cmlt v17.8h, v1.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v5.8h, #0 -; CHECK-NEXT: add v1.4s, v17.4s, v1.4s -; CHECK-NEXT: add v2.4s, v7.4s, v2.4s -; CHECK-NEXT: add v0.4s, v0.4s, v3.4s -; CHECK-NEXT: add v4.4s, v6.4s, v5.4s -; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b -; CHECK-NEXT: eor v1.16b, v1.16b, v17.16b -; CHECK-NEXT: cmlt v3.8h, v0.8h, #0 -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-NEXT: eor v2.16b, v4.16b, v6.16b +; CHECK-NEXT: mov v2.s[3], v1.s[2] +; CHECK-NEXT: mov v18.16b, v0.16b +; CHECK-NEXT: trn2 v0.4s, v0.4s, v7.4s +; CHECK-NEXT: mov v18.s[1], v6.s[1] +; CHECK-NEXT: mov v5.d[1], v4.d[1] +; CHECK-NEXT: mov v17.d[1], v2.d[1] +; CHECK-NEXT: mov v0.d[1], v16.d[1] +; CHECK-NEXT: mov v18.d[1], v3.d[1] +; CHECK-NEXT: add v1.4s, v17.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v5.4s, v17.4s +; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: add v3.4s, v18.4s, v0.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v18.4s +; CHECK-NEXT: ext v5.16b, v3.16b, v3.16b, #4 +; CHECK-NEXT: ext v16.16b, v4.16b, v2.16b, #8 +; CHECK-NEXT: zip1 v6.4s, v1.4s, v2.4s +; CHECK-NEXT: zip2 v7.4s, v1.4s, v2.4s +; CHECK-NEXT: ext v17.16b, v5.16b, v0.16b, #8 +; CHECK-NEXT: zip2 v1.4s, v2.4s, v1.4s +; CHECK-NEXT: zip2 v2.4s, v0.4s, v3.4s +; CHECK-NEXT: ext v4.16b, v16.16b, v4.16b, #4 +; CHECK-NEXT: zip1 v16.4s, v3.4s, v0.4s +; CHECK-NEXT: zip2 v0.4s, v3.4s, v0.4s +; CHECK-NEXT: ext v5.16b, v17.16b, v5.16b, #4 ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: sub v3.4s, v6.4s, v16.4s +; CHECK-NEXT: sub v0.4s, v7.4s, v0.4s +; CHECK-NEXT: cmlt v6.8h, v1.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v0.8h, #0 +; CHECK-NEXT: add v2.4s, v5.4s, v4.4s +; CHECK-NEXT: cmlt v4.8h, v3.8h, #0 +; CHECK-NEXT: add v1.4s, v6.4s, v1.4s +; CHECK-NEXT: add v0.4s, v7.4s, v0.4s +; CHECK-NEXT: cmlt v5.8h, v2.8h, #0 +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: eor v0.16b, v0.16b, v7.16b +; CHECK-NEXT: eor v1.16b, v1.16b, v6.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: eor v1.16b, v3.16b, v4.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: eor v1.16b, v2.16b, v5.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; 
CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 @@ -446,117 +434,112 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) { ; CHECK-LABEL: v3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sxtw x8, w3 -; CHECK-NEXT: sxtw x9, w1 -; CHECK-NEXT: add x10, x2, x8 -; CHECK-NEXT: add x11, x0, x9 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 +; CHECK-NEXT: sxtw x9, w3 +; CHECK-NEXT: add x10, x0, x8 +; CHECK-NEXT: add x11, x2, x9 ; CHECK-NEXT: add x12, x10, x8 -; CHECK-NEXT: add x13, x11, x9 -; CHECK-NEXT: add x8, x12, x8 -; CHECK-NEXT: add x9, x13, x9 -; CHECK-NEXT: ldp s0, s6, [x11] -; CHECK-NEXT: ldp s3, s7, [x10] -; CHECK-NEXT: ldp s1, s5, [x8] -; CHECK-NEXT: ldp s2, s4, [x9] -; CHECK-NEXT: ld1 { v1.s }[1], [x12], #4 -; CHECK-NEXT: ld1 { v2.s }[1], [x13], #4 -; CHECK-NEXT: ld1 { v3.s }[1], [x2], #4 -; CHECK-NEXT: ld1 { v0.s }[1], [x0], #4 -; CHECK-NEXT: ld1 { v5.s }[1], [x12] -; CHECK-NEXT: ld1 { v4.s }[1], [x13] -; CHECK-NEXT: ld1 { v7.s }[1], [x2] -; CHECK-NEXT: ld1 { v6.s }[1], [x0] -; CHECK-NEXT: usubl v0.8h, v0.8b, v3.8b -; CHECK-NEXT: usubl v1.8h, v2.8b, v1.8b -; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x2] +; CHECK-NEXT: ldr d2, [x10] +; CHECK-NEXT: add x10, x11, x9 +; CHECK-NEXT: ldr d4, [x12, x8] +; CHECK-NEXT: ldr d5, [x10, x9] +; CHECK-NEXT: ldr d3, [x11] +; CHECK-NEXT: ldr d6, [x12] +; CHECK-NEXT: ldr d7, [x10] +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: usubl v1.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b ; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b -; CHECK-NEXT: shll v4.4s, v2.4h, #16 -; CHECK-NEXT: shll v5.4s, v3.4h, #16 -; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 -; CHECK-NEXT: shll2 v2.4s, v2.8h, #16 -; CHECK-NEXT: saddw2 v3.4s, v3.4s, v0.8h -; CHECK-NEXT: saddw v0.4s, v5.4s, v0.4h -; CHECK-NEXT: saddw2 v2.4s, v2.4s, v1.8h -; CHECK-NEXT: rev64 v17.4s, v3.4s -; CHECK-NEXT: rev64 v6.4s, v0.4s -; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h +; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 +; CHECK-NEXT: shll2 v6.4s, v3.8h, #16 +; CHECK-NEXT: shll2 v7.4s, v1.8h, #16 +; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h +; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h +; CHECK-NEXT: saddw v1.4s, v7.4s, v1.4h +; CHECK-NEXT: rev64 v4.4s, v0.4s ; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: addp v16.4s, v0.4s, v3.4s -; CHECK-NEXT: rev64 v4.4s, v1.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v17.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s -; CHECK-NEXT: addp v7.4s, v1.4s, v2.4s -; CHECK-NEXT: ext v17.16b, v0.16b, v3.16b, #4 +; CHECK-NEXT: rev64 v7.4s, v1.4s +; CHECK-NEXT: rev64 v16.4s, v3.4s +; CHECK-NEXT: addp v6.4s, v2.4s, v0.4s +; CHECK-NEXT: addp v17.4s, v1.4s, v3.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s ; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s -; CHECK-NEXT: uzp2 v5.4s, v7.4s, v16.4s -; CHECK-NEXT: ext v4.16b, v16.16b, v16.16b, #8 -; CHECK-NEXT: uzp1 v16.4s, v7.4s, v16.4s -; CHECK-NEXT: zip2 v6.4s, v1.4s, v2.4s -; CHECK-NEXT: mov v3.s[3], v0.s[2] -; CHECK-NEXT: zip1 v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ext v0.16b, v17.16b, v0.16b, #4 -; CHECK-NEXT: rev64 v2.4s, v5.4s -; CHECK-NEXT: uzp1 v5.4s, v7.4s, v4.4s -; CHECK-NEXT: rev64 v16.4s, v16.4s -; CHECK-NEXT: uzp2 v4.4s, v7.4s, v4.4s -; CHECK-NEXT: mov v6.d[1], v3.d[1] +; 
CHECK-NEXT: sub v3.4s, v3.4s, v16.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v7.4s +; CHECK-NEXT: ext v4.16b, v2.16b, v0.16b, #4 +; CHECK-NEXT: zip2 v5.4s, v1.4s, v3.4s +; CHECK-NEXT: mov v0.s[3], v2.s[2] +; CHECK-NEXT: uzp2 v7.4s, v17.4s, v6.4s +; CHECK-NEXT: zip1 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ext v3.16b, v6.16b, v6.16b, #8 +; CHECK-NEXT: mov v5.d[1], v0.d[1] +; CHECK-NEXT: ext v0.16b, v4.16b, v2.16b, #4 +; CHECK-NEXT: uzp1 v2.4s, v17.4s, v6.4s +; CHECK-NEXT: rev64 v4.4s, v7.4s ; CHECK-NEXT: mov v1.d[1], v0.d[1] -; CHECK-NEXT: add v0.4s, v2.4s, v16.4s -; CHECK-NEXT: sub v2.4s, v5.4s, v4.4s -; CHECK-NEXT: sub v3.4s, v1.4s, v6.4s -; CHECK-NEXT: add v1.4s, v6.4s, v1.4s -; CHECK-NEXT: zip1 v4.4s, v2.4s, v3.4s -; CHECK-NEXT: zip1 v5.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v6.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v7.4s, v2.4s, v3.4s -; CHECK-NEXT: zip2 v16.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v17.16b, v2.16b, v4.16b, #8 -; CHECK-NEXT: uzp2 v6.4s, v6.4s, v0.4s -; CHECK-NEXT: mov v2.s[3], v3.s[2] -; CHECK-NEXT: trn2 v3.4s, v0.4s, v5.4s -; CHECK-NEXT: mov v0.s[1], v1.s[1] -; CHECK-NEXT: mov v6.d[1], v7.d[1] -; CHECK-NEXT: mov v16.d[1], v2.d[1] -; CHECK-NEXT: mov v3.d[1], v17.d[1] -; CHECK-NEXT: mov v0.d[1], v4.d[1] -; CHECK-NEXT: add v1.4s, v6.4s, v16.4s -; CHECK-NEXT: sub v2.4s, v16.4s, v6.4s -; CHECK-NEXT: add v7.4s, v3.4s, v0.4s -; CHECK-NEXT: ext v6.16b, v1.16b, v1.16b, #4 -; CHECK-NEXT: sub v0.4s, v0.4s, v3.4s -; CHECK-NEXT: ext v3.16b, v7.16b, v7.16b, #4 -; CHECK-NEXT: zip1 v4.4s, v1.4s, v2.4s -; CHECK-NEXT: zip2 v5.4s, v1.4s, v2.4s -; CHECK-NEXT: zip2 v1.4s, v2.4s, v1.4s -; CHECK-NEXT: zip2 v16.4s, v0.4s, v7.4s -; CHECK-NEXT: zip1 v17.4s, v7.4s, v0.4s -; CHECK-NEXT: zip2 v7.4s, v7.4s, v0.4s -; CHECK-NEXT: ext v2.16b, v6.16b, v2.16b, #8 -; CHECK-NEXT: ext v0.16b, v3.16b, v0.16b, #8 -; CHECK-NEXT: add v1.4s, v16.4s, v1.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s -; CHECK-NEXT: ext v2.16b, v2.16b, v6.16b, #4 -; CHECK-NEXT: ext v0.16b, v0.16b, v3.16b, #4 -; CHECK-NEXT: sub v3.4s, v5.4s, v7.4s -; CHECK-NEXT: cmlt v5.8h, v4.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: cmlt v2.8h, v1.8h, #0 -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: cmlt v7.8h, v0.8h, #0 -; CHECK-NEXT: add v4.4s, v5.4s, v4.4s -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: add v0.4s, v7.4s, v0.4s -; CHECK-NEXT: eor v2.16b, v4.16b, v5.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: eor v0.16b, v0.16b, v7.16b +; CHECK-NEXT: rev64 v0.4s, v2.4s +; CHECK-NEXT: uzp1 v2.4s, v17.4s, v3.4s +; CHECK-NEXT: uzp2 v3.4s, v17.4s, v3.4s +; CHECK-NEXT: add v6.4s, v5.4s, v1.4s +; CHECK-NEXT: add v0.4s, v4.4s, v0.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v3.4s +; CHECK-NEXT: zip1 v3.4s, v0.4s, v6.4s +; CHECK-NEXT: zip1 v4.4s, v2.4s, v1.4s +; CHECK-NEXT: mov v7.16b, v0.16b +; CHECK-NEXT: uzp2 v5.4s, v0.4s, v6.4s +; CHECK-NEXT: trn2 v3.4s, v0.4s, v3.4s +; CHECK-NEXT: ext v16.16b, v2.16b, v4.16b, #8 +; CHECK-NEXT: mov v7.s[1], v6.s[1] +; CHECK-NEXT: uzp2 v5.4s, v5.4s, v0.4s +; CHECK-NEXT: zip2 v0.4s, v0.4s, v6.4s +; CHECK-NEXT: mov v3.d[1], v16.d[1] +; CHECK-NEXT: zip2 v6.4s, v2.4s, v1.4s +; CHECK-NEXT: mov v7.d[1], v4.d[1] +; CHECK-NEXT: mov v2.s[3], v1.s[2] +; CHECK-NEXT: mov v5.d[1], v6.d[1] +; CHECK-NEXT: add v1.4s, v3.4s, v7.4s +; CHECK-NEXT: mov v0.d[1], v2.d[1] +; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: 
sub v3.4s, v7.4s, v3.4s +; CHECK-NEXT: add v4.4s, v5.4s, v0.4s +; CHECK-NEXT: ext v6.16b, v2.16b, v3.16b, #8 +; CHECK-NEXT: ext v7.16b, v4.16b, v4.16b, #4 +; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s +; CHECK-NEXT: zip2 v5.4s, v3.4s, v1.4s +; CHECK-NEXT: ext v2.16b, v6.16b, v2.16b, #4 +; CHECK-NEXT: ext v6.16b, v7.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v16.4s, v4.4s, v0.4s +; CHECK-NEXT: zip2 v17.4s, v4.4s, v0.4s +; CHECK-NEXT: zip2 v0.4s, v0.4s, v4.4s +; CHECK-NEXT: ext v4.16b, v6.16b, v7.16b, #4 +; CHECK-NEXT: zip1 v6.4s, v1.4s, v3.4s +; CHECK-NEXT: zip2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v0.4s, v5.4s, v0.4s +; CHECK-NEXT: add v2.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v3.4s, v16.4s, v6.4s +; CHECK-NEXT: sub v1.4s, v17.4s, v1.4s +; CHECK-NEXT: cmlt v6.8h, v0.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v1.8h, #0 +; CHECK-NEXT: cmlt v4.8h, v3.8h, #0 +; CHECK-NEXT: add v0.4s, v6.4s, v0.4s +; CHECK-NEXT: add v1.4s, v7.4s, v1.4s +; CHECK-NEXT: cmlt v5.8h, v2.8h, #0 +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: eor v1.16b, v1.16b, v7.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: eor v1.16b, v3.16b, v4.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: eor v1.16b, v2.16b, v5.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16