Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22377,6 +22377,75 @@
   return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
 }
 
+// Try to combine the op with uzp1. For example,
+//
+// smull(trunc(x), extract_high(y))
+// ==>
+// smull(extract_high(uzp1(undef, x)), extract_high(y))
+//
+// The resulting pair of extract_highs is then matched to smull2.
+static SDValue tryCombineOpWithUZP1(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  SDValue Trunc;
+  SDLoc DL(N);
+
+  // Check that one operand is a truncate and the other an extract_high.
+  if (isEssentiallyExtractHighSubvector(LHS) &&
+      RHS.getOpcode() == ISD::TRUNCATE)
+    Trunc = RHS;
+  else if (isEssentiallyExtractHighSubvector(RHS) &&
+           LHS.getOpcode() == ISD::TRUNCATE)
+    Trunc = LHS;
+  else
+    return SDValue();
+
+  // If the truncate's operand is a DUP or a splat, do not combine the op with
+  // uzp1; doing so causes regressions in
+  // test/CodeGen/AArch64/aarch64-smull.ll.
+  SDValue TruncOp = Trunc.getOperand(0);
+  EVT TruncOpVT = TruncOp.getValueType();
+  if (TruncOp.getOpcode() == AArch64ISD::DUP ||
+      DAG.isSplatValue(TruncOp, false))
+    return SDValue();
+
+  // Create uzp1 and extract_high.
+  EVT TruncVT = Trunc.getValueType();
+  EVT UZP1VT = TruncVT.getDoubleNumVectorElementsVT(*DAG.getContext());
+  SDValue HighIdx =
+      DAG.getConstant(TruncVT.getVectorNumElements(), DL, MVT::i64);
+  if (TruncOpVT != UZP1VT)
+    TruncOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncOp);
+  SDValue UZP1 =
+      DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, DAG.getUNDEF(UZP1VT), TruncOp);
+  UZP1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncVT, UZP1, HighIdx);
+
+  SDValue NewLHS = (Trunc == RHS) ? LHS : UZP1;
+  SDValue NewRHS = (Trunc == RHS) ? UZP1 : RHS;
+
+  return DAG.getNode(N->getOpcode(), DL, VT, NewLHS, NewRHS);
+}
+
+static SDValue performMULLCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  SelectionDAG &DAG) {
+  if (SDValue Val =
+          tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
+    return Val;
+
+  if (SDValue Val = tryCombineOpWithUZP1(N, DCI, DAG))
+    return Val;
+
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -22521,7 +22590,7 @@
   case AArch64ISD::SMULL:
   case AArch64ISD::UMULL:
   case AArch64ISD::PMULL:
-    return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
+    return performMULLCombine(N, DCI, DAG);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
Index: llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -52,14 +52,35 @@
 // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
 // %2:fpr64 = MOVID 0
 // %4:fpr128 = IMPLICIT_DEF
-// %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
-// %6:fpr128 = IMPLICIT_DEF
-// %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
-// %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
+// %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
+// %6:fpr128 = IMPLICIT_DEF
+// %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
+// %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
 // ==>
 // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
 // %6:fpr128 = IMPLICIT_DEF
-// %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
+// %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64,
+// %subreg.dsub
+//
+// 8. If UZP1 has an IMPLICIT_DEF as its low 64-bit operand, try to replace it
+// with XTN's operand. For example,
+//
+// %4:fpr128 = LDRQui %3:gpr64common, 0 :: (load (s128) from %ir.3, align 4)
+// %5:fpr64 = XTNv4i16 killed %4:fpr128
+// %6:fpr64 = COPY %0.dsub:fpr128
+// %7:fpr128 = LDRQui %3:gpr64common, 1 :: (load (s128) from %ir.7, align 4)
+// %9:fpr128 = IMPLICIT_DEF
+// %8:fpr128 = UZP1v8i16 killed %9:fpr128, killed %7:fpr128
+// %10:fpr128 = SMLSLv4i16_v4i32 %1:fpr128(tied-def 0), killed %6:fpr64, killed %5:fpr64
+// %11:fpr128 = SMLSLv8i16_v4i32 %10:fpr128(tied-def 0), %0:fpr128, killed %8:fpr128
+// ==>
+// %4:fpr128 = LDRQui %3:gpr64common, 0 :: (load (s128) from %ir.3, align 4)
+// %6:fpr64 = COPY %0.dsub:fpr128
+// %7:fpr128 = LDRQui %3:gpr64common, 1 :: (load (s128) from %ir.7, align 4)
+// %8:fpr128 = UZP1v8i16 killed %4:fpr128, killed %7:fpr128
+// %12:fpr64 = COPY %8.dsub:fpr128
+// %10:fpr128 = SMLSLv4i16_v4i32 %1:fpr128(tied-def 0), killed %6:fpr64, killed %12:fpr64
+// %11:fpr128 = SMLSLv8i16_v4i32 %10:fpr128(tied-def 0), %0:fpr128, killed %8:fpr128
 //
 //===----------------------------------------------------------------------===//
 
@@ -127,6 +148,7 @@
   bool visitINSERT(MachineInstr &MI);
   bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
   bool visitINSvi64lane(MachineInstr &MI);
+  bool visitUZP1v8i16(MachineInstr &MI, MachineBasicBlock &MBB);
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   StringRef getPassName() const override {
@@ -669,6 +691,88 @@
   return true;
 }
 
+bool AArch64MIPeepholeOpt::visitUZP1v8i16(MachineInstr &MI,
+                                          MachineBasicBlock &MBB) {
+  // If UZP1 has an IMPLICIT_DEF as its low 64-bit operand, try to replace it
+  // with XTN's operand. For example,
+  //
+  // %4:fpr128 = LDRQui %3:gpr64common, 0 :: (load (s128) from %ir.3, align 4)
+  // %5:fpr64 = XTNv4i16 killed %4:fpr128
+  // %6:fpr64 = COPY %0.dsub:fpr128
+  // %7:fpr128 = LDRQui %3:gpr64common, 1 :: (load (s128) from %ir.7, align 4)
+  // %9:fpr128 = IMPLICIT_DEF
+  // %8:fpr128 = UZP1v8i16 killed %9:fpr128, killed %7:fpr128
+  // %10:fpr128 = SMLSLv4i16_v4i32 %1:fpr128(tied-def 0), killed %6:fpr64, killed %5:fpr64
+  // %11:fpr128 = SMLSLv8i16_v4i32 %10:fpr128(tied-def 0), %0:fpr128, killed %8:fpr128
+  // ==>
+  // %4:fpr128 = LDRQui %3:gpr64common, 0 :: (load (s128) from %ir.3, align 4)
+  // %6:fpr64 = COPY %0.dsub:fpr128
+  // %7:fpr128 = LDRQui %3:gpr64common, 1 :: (load (s128) from %ir.7, align 4)
+  // %8:fpr128 = UZP1v8i16 killed %4:fpr128, killed %7:fpr128
+  // %12:fpr64 = COPY %8.dsub:fpr128
+  // %10:fpr128 = SMLSLv4i16_v4i32 %1:fpr128(tied-def 0), killed %6:fpr64, killed %12:fpr64
+  // %11:fpr128 = SMLSLv8i16_v4i32 %10:fpr128(tied-def 0), %0:fpr128, killed %8:fpr128
+
+  MachineInstr *UZP1MI = &MI;
+  MachineInstr *UZP1MIOP1DefMI =
+      MRI->getUniqueVRegDef(UZP1MI->getOperand(1).getReg());
+
+  // Check that UZP1's first source operand is defined by an IMPLICIT_DEF.
+  if (UZP1MIOP1DefMI->getOpcode() != TargetOpcode::IMPLICIT_DEF)
+    return false;
+
+  // Look for an XTN in this block and find its use.
+  MachineInstr *XTNMI = nullptr;
+  MachineInstr *XTNUseMI = nullptr;
+  for (MachineInstr &CandMI : MBB) {
+    if (CandMI.getOpcode() == AArch64::XTNv4i16) {
+      // For simplicity, only handle the single-use case.
+      XTNMI = &CandMI;
+      if (!MRI->hasOneUse(XTNMI->getOperand(0).getReg()))
+        return false;
+
+      XTNUseMI = MRI->use_begin(XTNMI->getOperand(0).getReg())->getParent();
+      break;
+    }
+  }
+
+  if (XTNUseMI == nullptr)
+    return false;
+
+  // If XTN's user is in UZP1's block, bail out when that user appears before
+  // UZP1; the COPY inserted below would not dominate it.
+  if (XTNUseMI->getParent() == &MBB) {
+    unsigned DistanceUZP1MI =
+        (unsigned)std::distance(MBB.instr_begin(), UZP1MI->getIterator());
+    unsigned DistanceXTNUseMI =
+        (unsigned)std::distance(MBB.instr_begin(), XTNUseMI->getIterator());
+    if (DistanceUZP1MI > DistanceXTNUseMI)
+      return false;
+  }
+
+  // Build the new UZP1 with XTN's source operand.
+  MachineInstr *NewUZP1MI =
+      BuildMI(MBB, UZP1MI, UZP1MI->getDebugLoc(), TII->get(UZP1MI->getOpcode()),
+              UZP1MI->getOperand(0).getReg())
+          .add(XTNMI->getOperand(1))
+          .add(UZP1MI->getOperand(2));
+
+  // Build a COPY of the low 64 bits of the new UZP1.
+  Register COPYLow64DstReg =
+      MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+  BuildMI(MBB, UZP1MI, UZP1MI->getDebugLoc(), TII->get(TargetOpcode::COPY),
+          COPYLow64DstReg)
+      .addUse(NewUZP1MI->getOperand(0).getReg(), 0, AArch64::dsub);
+
+  // Replace XTN's result with the low 64 bits of the new UZP1.
+  MRI->replaceRegWith(XTNMI->getOperand(0).getReg(), COPYLow64DstReg);
+
+  UZP1MI->eraseFromParent();
+  XTNMI->eraseFromParent();
+
+  return true;
+}
+
 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -747,6 +851,9 @@
       case AArch64::INSvi64lane:
         Changed |= visitINSvi64lane(MI);
         break;
+      case AArch64::UZP1v8i16:
+        Changed |= visitUZP1v8i16(MI, MBB);
+        break;
       }
     }
   }
Index: llvm/test/CodeGen/AArch64/aarch64-smull.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -1033,13 +1033,13 @@
 ; CHECK-LABEL: umull_and_v8i32:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: movi v3.2d, #0x0000ff000000ff
-; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT: and v2.16b, v2.16b, v3.16b
 ; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: uzp1 v3.8h, v0.8h, v2.8h
 ; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: xtn v2.4h, v2.4s
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: umull v1.4s, v4.4h, v2.4h
+; CHECK-NEXT: umull v2.4s, v0.4h, v1.4h
+; CHECK-NEXT: umull2 v1.4s, v0.8h, v3.8h
+; CHECK-NEXT: mov v0.16b, v2.16b
 ; CHECK-NEXT: ret
 entry:
   %in1 = zext <8 x i16> %src1 to <8 x i32>
@@ -1084,13 +1084,13 @@
 ; CHECK-LABEL: umull_and_v4i64:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: movi v3.2d, #0x000000000000ff
-; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT: and v2.16b, v2.16b, v3.16b
 ; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: uzp1 v3.4s, v0.4s, v2.4s
 ; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: xtn v2.2s, v2.2d
-; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: umull v1.2d, v4.2s, v2.2s
+; CHECK-NEXT: umull v2.2d, v0.2s, v1.2s
+; CHECK-NEXT: umull2 v1.2d, v0.4s, v3.4s
+; CHECK-NEXT: mov v0.16b, v2.16b
 ; CHECK-NEXT: ret
 entry:
   %in1 = zext <4 x i32> %src1 to <4 x i64>
@@ -1115,3 +1115,150 @@
   %out = mul nsw <4 x i64> %in1, %broadcast.splat
   ret <4 x i64> %out
 }
+
+define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
+; CHECK-LABEL: pmlsl2_v8i16_uzp1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q2, [x1, #16]
+; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b
+; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+  %5 = getelementptr inbounds i32, ptr %3, i64 4
+  %6 = load <8 x i16>, ptr %5, align 4
+  %7 = trunc <8 x i16> %6 to <8 x i8>
+  %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %9 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %8, <8 x i8> %7)
+  %10 = sub <8 x i16> %1, %9
+  store <8 x i16> %10, ptr %2, align 16
+  ret void
+}
+
+define void @smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
+; CHECK-LABEL: smlsl2_v8i16_uzp1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q2, [x1, #16]
+; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-NEXT: smlsl2 v1.8h, v0.16b, v2.16b
+; CHECK-NEXT: str q1, [x0]
+; CHECK-NEXT: ret
+  %5 = getelementptr inbounds i32, ptr %3, i64 4
+  %6 = load <8 x i16>, ptr %5, align 4
+  %7 = trunc <8 x i16> %6 to <8 x i8>
+  %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %9 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %8, <8 x i8> %7)
+  %10 = sub <8 x i16> %1, %9
+  store <8 x i16> %10, ptr %2, align 16
+  ret void
+}
+
+define void @umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
+; CHECK-LABEL: umlsl2_v8i16_uzp1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q2, [x1, #16]
+; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-NEXT: umlsl2 v1.8h, v0.16b, v2.16b
+; CHECK-NEXT: str q1, [x0]
+; CHECK-NEXT: ret
+  %5 = getelementptr inbounds i32, ptr %3, i64 4
+  %6 = load <8 x i16>, ptr %5, align 4
+  %7 = trunc <8 x i16> %6 to <8 x i8>
+  %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %9 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %8, <8 x i8> %7)
+  %10 = sub <8 x i16> %1, %9
+  store <8 x i16> %10, ptr %2, align 16
+  ret void
+}
+
+define void @smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
+; CHECK-LABEL: smlsl2_v4i32_uzp1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q2, [x1, #16]
+; CHECK-NEXT: uzp1 v2.8h, v0.8h, v2.8h
+; CHECK-NEXT: smlsl2 v1.4s, v0.8h, v2.8h
+; CHECK-NEXT: str q1, [x0]
+; CHECK-NEXT: ret
+  %5 = getelementptr inbounds i32, ptr %3, i64 4
+  %6 = load <4 x i32>, ptr %5, align 4
+  %7 = trunc <4 x i32> %6 to <4 x i16>
+  %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %9 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %8, <4 x i16> %7)
+  %10 = sub <4 x i32> %1, %9
+  store <4 x i32> %10, ptr %2, align 16
+  ret void
+}
+
+define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
+; CHECK-LABEL: umlsl2_v4i32_uzp1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q2, [x1, #16]
+; CHECK-NEXT: uzp1 v2.8h, v0.8h, v2.8h
+; CHECK-NEXT: umlsl2 v1.4s, v0.8h, v2.8h
+; CHECK-NEXT: str q1, [x0]
+; CHECK-NEXT: ret
+  %5 = getelementptr inbounds i32, ptr %3, i64 4
+  %6 = load <4 x i32>, ptr %5, align 4
+  %7 = trunc <4 x i32> %6 to <4 x i16>
+  %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %9 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %8, <4 x i16> %7)
+  %10 = sub <4 x i32> %1, %9
+  store <4 x i32> %10, ptr %2, align 16
+  ret void
+}
+
+define void @smlsl_smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) {
+; CHECK-LABEL: smlsl_smlsl2_v4i32_uzp1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldp q2, q3, [x1]
+; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-NEXT: smlsl v1.4s, v0.4h, v2.4h
+; CHECK-NEXT: smlsl2 v1.4s, v0.8h, v2.8h
+; CHECK-NEXT: str q1, [x0]
+; CHECK-NEXT: ret
+entry:
+  %5 = load <4 x i32>, ptr %3, align 4
+  %6 = trunc <4 x i32> %5 to <4 x i16>
+  %7 = getelementptr inbounds i32, ptr %3, i64 4
+  %8 = load <4 x i32>, ptr %7, align 4
+  %9 = trunc <4 x i32> %8 to <4 x i16>
+  %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %11 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %10, <4 x i16> %6)
+  %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %13 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %12, <4 x i16> %9)
+  %14 = add <4 x i32> %11, %13
+  %15 = sub <4 x i32> %1, %14
+  store <4 x i32> %15, ptr %2, align 16
+  ret void
+}
+
+define void @umlsl_umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) {
+; CHECK-LABEL: umlsl_umlsl2_v4i32_uzp1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldp q2, q3, [x1]
+; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-NEXT: umlsl v1.4s, v0.4h, v2.4h
+; CHECK-NEXT: umlsl2 v1.4s, v0.8h, v2.8h
+; CHECK-NEXT: str q1, [x0]
+; CHECK-NEXT: ret
+entry:
+  %5 = load <4 x i32>, ptr %3, align 4
+  %6 = trunc <4 x i32> %5 to <4 x i16>
+  %7 = getelementptr inbounds i32, ptr %3, i64 4
+  %8 = load <4 x i32>, ptr %7, align 4
+  %9 = trunc <4 x i32> %8 to <4 x i16>
+  %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %11 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %10, <4 x i16> %6)
+  %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %13 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %12, <4 x i16> %9)
+  %14 = add <4 x i32> %11, %13
+  %15 = sub <4 x i32> %1, %14
+  store <4 x i32> %15, ptr %2, align 16
+  ret void
+}
+
+declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
Index: llvm/test/CodeGen/AArch64/zext-to-tbl.ll
===================================================================
--- llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -2877,25 +2877,21 @@
 ; CHECK-NEXT: ldr q3, [x11, lCPI24_3@PAGEOFF]
 ; CHECK-NEXT: LBB24_1: ; %loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q5, [x1], #16
-; CHECK-NEXT: ldr q4, [x8, #16]!
-; CHECK-NEXT: ldr q6, [x0]
+; CHECK-NEXT: ldr q4, [x1], #16
 ; CHECK-NEXT: subs w2, w2, #1
-; CHECK-NEXT: tbl.16b v16, { v5 }, v0
-; CHECK-NEXT: tbl.16b v17, { v5 }, v1
-; CHECK-NEXT: tbl.16b v18, { v5 }, v2
-; CHECK-NEXT: ext.16b v7, v4, v4, #8
-; CHECK-NEXT: tbl.16b v5, { v5 }, v3
-; CHECK-NEXT: xtn.4h v16, v16
-; CHECK-NEXT: xtn.4h v17, v17
-; CHECK-NEXT: xtn.4h v18, v18
-; CHECK-NEXT: ext.16b v19, v6, v6, #8
-; CHECK-NEXT: umull.4s v4, v4, v16
-; CHECK-NEXT: umull.4s v7, v7, v17
-; CHECK-NEXT: umull.4s v6, v6, v18
-; CHECK-NEXT: xtn.4h v5, v5
-; CHECK-NEXT: stp q4, q7, [x0, #32]
-; CHECK-NEXT: umull.4s v4, v19, v5
+; CHECK-NEXT: tbl.16b v5, { v4 }, v0
+; CHECK-NEXT: tbl.16b v6, { v4 }, v1
+; CHECK-NEXT: tbl.16b v7, { v4 }, v2
+; CHECK-NEXT: tbl.16b v4, { v4 }, v3
+; CHECK-NEXT: uzp1.8h v5, v5, v6
+; CHECK-NEXT: ldr q6, [x8, #16]!
+; CHECK-NEXT: uzp1.8h v4, v7, v4 +; CHECK-NEXT: ldr q7, [x0] +; CHECK-NEXT: umull.4s v16, v6, v5 +; CHECK-NEXT: umull2.4s v5, v6, v5 +; CHECK-NEXT: umull.4s v6, v7, v4 +; CHECK-NEXT: umull2.4s v4, v7, v4 +; CHECK-NEXT: stp q16, q5, [x0, #32] ; CHECK-NEXT: str q6, [x0] ; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: str q4, [x8] @@ -2928,32 +2924,28 @@ ; CHECK-BE-NEXT: add x8, x0, #16 ; CHECK-BE-NEXT: add x9, x0, #32 ; CHECK-BE-NEXT: add x10, x0, #48 -; CHECK-BE-NEXT: ld1 { v6.8h }, [x0] +; CHECK-BE-NEXT: ld1 { v16.8h }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: add x1, x1, #16 ; CHECK-BE-NEXT: ld1 { v17.8h }, [x8] -; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v1.16b -; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v0.16b -; CHECK-BE-NEXT: tbl v16.16b, { v4.16b }, v3.16b -; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v2.16b +; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v2.16b +; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v1.16b +; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v3.16b +; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v0.16b ; CHECK-BE-NEXT: rev32 v5.16b, v5.16b -; CHECK-BE-NEXT: rev32 v7.16b, v7.16b -; CHECK-BE-NEXT: rev32 v16.16b, v16.16b -; CHECK-BE-NEXT: rev32 v4.16b, v4.16b -; CHECK-BE-NEXT: xtn v5.4h, v5.4s -; CHECK-BE-NEXT: ext v18.16b, v17.16b, v17.16b, #8 -; CHECK-BE-NEXT: xtn v7.4h, v7.4s -; CHECK-BE-NEXT: umull v5.4s, v6.4h, v5.4h -; CHECK-BE-NEXT: ext v6.16b, v6.16b, v6.16b, #8 -; CHECK-BE-NEXT: xtn v4.4h, v4.4s -; CHECK-BE-NEXT: st1 { v5.4s }, [x0] -; CHECK-BE-NEXT: xtn v5.4h, v16.4s -; CHECK-BE-NEXT: umull v6.4s, v6.4h, v7.4h +; CHECK-BE-NEXT: rev16 v6.16b, v6.16b +; CHECK-BE-NEXT: rev16 v4.16b, v4.16b +; CHECK-BE-NEXT: uzp1 v5.8h, v5.8h, v6.8h +; CHECK-BE-NEXT: rev32 v6.16b, v7.16b +; CHECK-BE-NEXT: umull v7.4s, v16.4h, v5.4h +; CHECK-BE-NEXT: uzp1 v4.8h, v6.8h, v4.8h +; CHECK-BE-NEXT: umull2 v5.4s, v16.8h, v5.8h +; CHECK-BE-NEXT: st1 { v7.4s }, [x0] ; CHECK-BE-NEXT: mov x0, x8 -; CHECK-BE-NEXT: umull v5.4s, v17.4h, v5.4h -; CHECK-BE-NEXT: umull v4.4s, v18.4h, v4.4h -; CHECK-BE-NEXT: st1 { v6.4s }, [x8] -; CHECK-BE-NEXT: st1 { v5.4s }, [x9] +; CHECK-BE-NEXT: umull v6.4s, v17.4h, v4.4h +; CHECK-BE-NEXT: umull2 v4.4s, v17.8h, v4.8h +; CHECK-BE-NEXT: st1 { v5.4s }, [x8] +; CHECK-BE-NEXT: st1 { v6.4s }, [x9] ; CHECK-BE-NEXT: st1 { v4.4s }, [x10] ; CHECK-BE-NEXT: b.ne .LBB24_1 ; CHECK-BE-NEXT: // %bb.2: // %exit
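Reviewer note (not part of the patch): below is a minimal LLVM IR sketch of the pattern the new tryCombineOpWithUZP1 DAG combine targets; the function and value names are illustrative only. One operand of the long multiply is an extract-high shuffle and the other is a truncate of a wider vector; the combine rewrites the truncate as uzp1(undef, x) plus an extract_high so the multiply can be selected as smull2 instead of xtn followed by smull, as the tests added to aarch64-smull.ll check. Exact codegen for this sketch may differ.

define <4 x i32> @smull2_via_uzp1(<8 x i16> %a, <4 x i32> %b) {
  ; extract_high operand: the upper four i16 lanes of %a.
  %hi = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; truncate of a wider vector: rewritten by the combine to uzp1 + extract_high.
  %narrow = trunc <4 x i32> %b to <4 x i16>
  %mul = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %hi, <4 x i16> %narrow)
  ret <4 x i32> %mul
}

declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)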