diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1394,18 +1394,16 @@
     for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
       setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);

-    if (Subtarget->forceSVEInStreamingMode()) {
-      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
-        if (useSVEForFixedLengthVectorVT(VT, true)) {
+    if (Subtarget->forceStreamingCompatibleSVE()) {
+      for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
+                     MVT::v4i32, MVT::v1i64, MVT::v2i64})
+        if (useSVEForFixedLengthVectorVT(VT, true))
           addTypeForStreamingSVE(VT);
-        }
-      }
-      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
-        if (useSVEForFixedLengthVectorVT(VT, true)) {
+      for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32,
+                     MVT::v4f32, MVT::v1f64, MVT::v2f64})
+        if (useSVEForFixedLengthVectorVT(VT, true))
           addTypeForStreamingSVE(VT);
-        }
-      }
     }

   // NOTE: Currently this has to happen after computeRegisterProperties rather
@@ -1615,11 +1613,19 @@
 }

 void AArch64TargetLowering::addTypeForStreamingSVE(MVT VT) {
+  setOperationAction(ISD::LOAD, VT, Custom);
   setOperationAction(ISD::ANY_EXTEND, VT, Custom);
   setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
   setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
-  setOperationAction(ISD::LOAD, VT, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+  setOperationAction(ISD::ADD, VT, Custom);
+  setOperationAction(ISD::SUB, VT, Custom);
+  setOperationAction(ISD::MUL, VT, Custom);
+  setOperationAction(ISD::MULHS, VT, Custom);
+  setOperationAction(ISD::MULHU, VT, Custom);
+  setOperationAction(ISD::ABS, VT, Custom);
+  setOperationAction(ISD::AND, VT, Custom);
+  setOperationAction(ISD::XOR, VT, Custom);
 }

 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
@@ -3524,7 +3530,8 @@
 }

 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
-  if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
+                                   Subtarget->forceStreamingCompatibleSVE()))
     return LowerToScalableOp(Op, DAG);

   SDValue Sel = Op.getOperand(0);
@@ -4436,7 +4443,8 @@
   EVT VT = Op.getValueType();

   // If SVE is available then i64 vector multiplications can also be made legal.
-  bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
+  bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64 ||
+                      Subtarget->forceStreamingCompatibleSVE();

   if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
@@ -5783,7 +5791,7 @@
     return LowerMLOAD(Op, DAG);
   case ISD::LOAD:
     if (useSVEForFixedLengthVectorVT(Op.getValueType(),
-                                     Subtarget->forceSVEInStreamingMode()))
+                                     Subtarget->forceStreamingCompatibleSVE()))
       return LowerFixedLengthVectorLoadToSVE(Op, DAG);
     return LowerLOAD(Op, DAG);
   case ISD::ADD:
@@ -10904,7 +10912,7 @@

   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

-  if (useSVEForFixedLengthVectorVT(VT, Subtarget->forceSVEInStreamingMode()))
+  if (useSVEForFixedLengthVectorVT(VT, Subtarget->forceStreamingCompatibleSVE()))
     return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);

   // Convert shuffles that are directly supported on NEON to target-specific
@@ -11474,7 +11482,8 @@

 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                              SelectionDAG &DAG) const {
-  if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
+                                   Subtarget->forceStreamingCompatibleSVE()))
     return LowerToScalableOp(Op, DAG);

   // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
@@ -11594,8 +11603,7 @@
                                                  SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();

-  // override NEON if possible.
-  if (useSVEForFixedLengthVectorVT(VT, Subtarget->forceSVEInStreamingMode())) {
+  if (useSVEForFixedLengthVectorVT(VT, Subtarget->forceStreamingCompatibleSVE())) {
     if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
       SDLoc DL(Op);
       EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
@@ -11926,7 +11934,7 @@
 SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                    SelectionDAG &DAG) const {
   if (useSVEForFixedLengthVectorVT(Op.getValueType(),
-                                   Subtarget->forceSVEInStreamingMode()))
+                                   Subtarget->forceStreamingCompatibleSVE()))
     return LowerFixedLengthConcatVectorsToSVE(Op, DAG);

   assert(Op.getValueType().isScalableVector() &&
@@ -12032,8 +12040,7 @@
     return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
   }

-  // try overriding NEON if possible.
-  if (useSVEForFixedLengthVectorVT(VT, Subtarget->forceSVEInStreamingMode()))
+  if (useSVEForFixedLengthVectorVT(VT, Subtarget->forceStreamingCompatibleSVE()))
     return LowerFixedLengthExtractVectorElt(Op, DAG);

   // Check for non-constant or out of range lane.
@@ -12092,7 +12099,7 @@
   if (Idx == 0 && InVT.getSizeInBits() <= 128)
     return Op;

-  if (!Subtarget->forceSVEInStreamingMode()) {
+  if (!Subtarget->forceStreamingCompatibleSVE()) {
     // If this is extracting the upper 64-bits of a 128-bit vector, we match
     // that directly.
     if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
@@ -12101,7 +12108,7 @@
   }

   if (useSVEForFixedLengthVectorVT(InVT,
-                                   Subtarget->forceSVEInStreamingMode())) {
+                                   Subtarget->forceStreamingCompatibleSVE())) {
     SDLoc DL(Op);
     EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);

@@ -12399,9 +12406,8 @@

   switch (Op.getOpcode()) {
   case ISD::SHL:
-    // override NEON if possible.
     if (VT.isScalableVector() ||
-        useSVEForFixedLengthVectorVT(VT, Subtarget->forceSVEInStreamingMode()))
+        useSVEForFixedLengthVectorVT(VT, Subtarget->forceStreamingCompatibleSVE()))
       return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);

     if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
@@ -12413,10 +12419,9 @@
                        Op.getOperand(0), Op.getOperand(1));
   case ISD::SRA:
   case ISD::SRL:
-    // override NEON if possible.
     if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT,
-                                     Subtarget->forceSVEInStreamingMode())) {
+                                     Subtarget->forceStreamingCompatibleSVE())) {
       unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
                                                 : AArch64ISD::SRL_PRED;
       return LowerToPredicatedOp(Op, DAG, Opc);
@@ -15529,7 +15534,8 @@
 }

 static SDValue performANDCombine(SDNode *N,
-                                 TargetLowering::DAGCombinerInfo &DCI) {
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
@@ -15540,10 +15545,15 @@
   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
     return SDValue();

   if (VT.isScalableVector())
     return performSVEAndCombine(N, DCI);

+  // The NEON-only combines below are unavailable when we are restricted to
+  // streaming-compatible SVE, so give the SVE combines a chance for
+  // fixed-length vectors as well.
+  if (VT.isFixedLengthVector() && Subtarget->forceStreamingCompatibleSVE())
+    return performSVEAndCombine(N, DCI);
+
   // The combining code below works only for NEON vectors. In particular, it
   // does not work for SVE when dealing with vectors wider than 128 bits.
   if (!VT.is64BitVector() && !VT.is128BitVector())
     return SDValue();
@@ -20262,7 +20274,7 @@
   case ISD::OR:
     return performORCombine(N, DCI, Subtarget);
   case ISD::AND:
-    return performANDCombine(N, DCI);
+    return performANDCombine(N, DCI, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
     return performIntrinsicCombine(N, DCI, Subtarget);
   case ISD::ANY_EXTEND:
@@ -22069,7 +22081,7 @@
 SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
                                                  SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
-  assert(useSVEForFixedLengthVectorVT(VT) &&
+  assert(useSVEForFixedLengthVectorVT(VT, Subtarget->forceStreamingCompatibleSVE()) &&
          "Only expected to lower fixed length vector operation!");
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
@@ -22085,7 +22097,7 @@
     }

     // "cast" fixed length vector to a scalable vector.
-    assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
+    assert(useSVEForFixedLengthVectorVT(V.getValueType(), Subtarget->forceStreamingCompatibleSVE()) &&
           "Only fixed length vectors are supported!");
     Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
   }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -218,7 +218,7 @@

 def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">;

-def IsForcingSVEDisabled : Predicate<"!Subtarget->forceSVEInStreamingMode()">;
+def IsForceStreamingCompatibleSVEDisabled : Predicate<"!Subtarget->forceStreamingCompatibleSVE()">;

 def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
                                  SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3032,7 +3032,7 @@
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;

   // Extract element from vector with immediate index that's within the bottom 128-bits.
-  let Predicates = [IsForcingSVEDisabled] in {
+  let Predicates = [IsForceStreamingCompatibleSVEDisabled] in {
   let AddedComplexity = 1 in {
   def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)),
             (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -361,7 +361,7 @@

   bool useSVEForFixedLengthVectors() const;

-  bool forceSVEInStreamingMode() const;
+  bool forceStreamingCompatibleSVE() const;

   unsigned getVScaleForTuning() const { return VScaleForTuning; }

diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -66,7 +66,7 @@
                   cl::CommaSeparated, cl::Hidden);

 static cl::opt<bool>
-    ForceSVEWhenStreamingCompatible("force-sve-when-streaming-compatible",
+    ForceStreamingCompatibleSVE("force-streaming-compatible-sve",
                                     cl::init(false), cl::Hidden);

 unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
@@ -437,15 +437,19 @@
 bool AArch64Subtarget::useAA() const { return UseAA; }

 bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
-  if (ForceSVEWhenStreamingCompatible)
+  if (ForceStreamingCompatibleSVE) {
+    assert(hasSVE() && "Expected SVE to be available");
     return hasSVE();
+  }

   // Prefer NEON unless larger SVE registers are available.
   return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
 }

-bool AArch64Subtarget::forceSVEInStreamingMode() const {
-  if (ForceSVEWhenStreamingCompatible)
+bool AArch64Subtarget::forceStreamingCompatibleSVE() const {
+  if (ForceStreamingCompatibleSVE) {
+    assert(hasSVE() && "Expected SVE to be available");
     return hasSVE();
+  }
   return false;
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -1,7 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -aarch64-sve-vector-bits-min=128 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s

 target triple = "aarch64-unknown-linux-gnu"

@@ -23,8 +21,9 @@
 define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) #0 {
 ; CHECK-LABEL: load_zext_v4i16i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, <4 x i16>* %ap
@@ -35,8 +34,9 @@
 define <2 x i64> @load_zext_v2i32i64(<2 x i32>* %ap) #0 {
 ; CHECK-LABEL: load_zext_v2i32i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uunpklo z0.d, z0.s
;
CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %a = load <2 x i32>, <2 x i32>* %ap @@ -45,122 +45,64 @@ } define <2 x i256> @load_zext_v2i64i256(<2 x i64>* %ap) #0 { -; VBITS_GE_128-LABEL: load_zext_v2i64i256: -; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: adrp x8, .LCPI3_0 -; VBITS_GE_128-NEXT: add x8, x8, :lo12:.LCPI3_0 -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 -; VBITS_GE_128-NEXT: mov x1, xzr -; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_128-NEXT: mov x5, xzr -; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x8] -; VBITS_GE_128-NEXT: mov z2.d, z0.d[1] -; VBITS_GE_128-NEXT: fmov x0, d0 -; VBITS_GE_128-NEXT: mov z0.d, z1.d[1] -; VBITS_GE_128-NEXT: fmov x2, d1 -; VBITS_GE_128-NEXT: fmov x3, d0 -; VBITS_GE_128-NEXT: fmov x4, d2 -; VBITS_GE_128-NEXT: mov x6, x2 -; VBITS_GE_128-NEXT: mov x7, x3 -; VBITS_GE_128-NEXT: ret -; -; VBITS_GE_1024-LABEL: load_zext_v2i64i256: -; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.d, vl2 -; VBITS_GE_1024-NEXT: mov x1, xzr -; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: mov x2, xzr -; VBITS_GE_1024-NEXT: mov x3, xzr -; VBITS_GE_1024-NEXT: mov x5, xzr -; VBITS_GE_1024-NEXT: mov x6, xzr -; VBITS_GE_1024-NEXT: mov x7, xzr -; VBITS_GE_1024-NEXT: mov z1.d, z0.d[1] -; VBITS_GE_1024-NEXT: fmov x0, d0 -; VBITS_GE_1024-NEXT: fmov x4, d1 -; VBITS_GE_1024-NEXT: ret -; -; VBITS_GE_2048-LABEL: load_zext_v2i64i256: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.d, vl2 -; VBITS_GE_2048-NEXT: mov x1, xzr -; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: mov x2, xzr -; VBITS_GE_2048-NEXT: mov x3, xzr -; VBITS_GE_2048-NEXT: mov x5, xzr -; VBITS_GE_2048-NEXT: mov x6, xzr -; VBITS_GE_2048-NEXT: mov x7, xzr -; VBITS_GE_2048-NEXT: mov z1.d, z0.d[1] -; VBITS_GE_2048-NEXT: fmov x0, d0 -; VBITS_GE_2048-NEXT: fmov x4, d1 -; VBITS_GE_2048-NEXT: ret +; CHECK-LABEL: load_zext_v2i64i256: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI3_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov x1, xzr +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: mov x5, xzr +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8] +; CHECK-NEXT: mov z2.d, z0.d[1] +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: fmov x2, d1 +; CHECK-NEXT: fmov x3, d0 +; CHECK-NEXT: fmov x4, d2 +; CHECK-NEXT: mov x6, x2 +; CHECK-NEXT: mov x7, x3 +; CHECK-NEXT: ret %a = load <2 x i64>, <2 x i64>* %ap %val = zext <2 x i64> %a to <2 x i256> ret <2 x i256> %val } define <16 x i32> @load_sext_v16i8i32(<16 x i8>* %ap) #0 { -; VBITS_GE_128-LABEL: load_sext_v16i8i32: -; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.b, vl16 -; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0] -; VBITS_GE_128-NEXT: sunpklo z3.h, z1.b -; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 -; VBITS_GE_128-NEXT: sunpklo z4.h, z1.b -; VBITS_GE_128-NEXT: sunpklo z0.s, z3.h -; VBITS_GE_128-NEXT: ext z3.b, z3.b, z3.b, #8 -; VBITS_GE_128-NEXT: sunpklo z2.s, z4.h -; VBITS_GE_128-NEXT: ext z4.b, z4.b, z4.b, #8 -; VBITS_GE_128-NEXT: sunpklo z1.s, z3.h -; VBITS_GE_128-NEXT: sunpklo z3.s, z4.h -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 -; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 killed $z1 -; VBITS_GE_128-NEXT: // kill: def $q2 killed $q2 killed $z2 -; VBITS_GE_128-NEXT: // kill: def $q3 killed $q3 killed $z3 -; VBITS_GE_128-NEXT: ret -; -; VBITS_GE_1024-LABEL: load_sext_v16i8i32: -; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 -; VBITS_GE_1024-NEXT: ld1sb { z0.s }, 
p0/z, [x0] -; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8] -; VBITS_GE_1024-NEXT: ret -; -; VBITS_GE_2048-LABEL: load_sext_v16i8i32: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.s, vl16 -; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] -; VBITS_GE_2048-NEXT: ret +; CHECK-LABEL: load_sext_v16i8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: sunpklo z3.h, z1.b +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: sunpklo z4.h, z1.b +; CHECK-NEXT: sunpklo z0.s, z3.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z2.s, z4.h +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: sunpklo z1.s, z3.h +; CHECK-NEXT: sunpklo z3.s, z4.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 +; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2 +; CHECK-NEXT: // kill: def $q3 killed $q3 killed $z3 +; CHECK-NEXT: ret %a = load <16 x i8>, <16 x i8>* %ap %val = sext <16 x i8> %a to <16 x i32> ret <16 x i32> %val } define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 { -; VBITS_GE_128-LABEL: load_sext_v8i16i32: -; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.h, vl8 -; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_128-NEXT: sunpklo z0.s, z1.h -; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 -; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 -; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 killed $z1 -; VBITS_GE_128-NEXT: ret -; -; VBITS_GE_1024-LABEL: load_sext_v8i16i32: -; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.s, vl8 -; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0] -; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8] -; VBITS_GE_1024-NEXT: ret -; -; VBITS_GE_2048-LABEL: load_sext_v8i16i32: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.s, vl8 -; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] -; VBITS_GE_2048-NEXT: ret +; CHECK-LABEL: load_sext_v8i16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: sunpklo z0.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 +; CHECK-NEXT: ret %a = load <8 x i16>, <8 x i16>* %ap %val = sext <8 x i16> %a to <8 x i32> ret <8 x i32> %val @@ -199,171 +141,73 @@ } define <2 x i256> @load_sext_v2i64i256(<2 x i64>* %ap) #0 { -; VBITS_GE_128-LABEL: load_sext_v2i64i256: -; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 -; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_128-NEXT: mov z1.d, z0.d[1] -; VBITS_GE_128-NEXT: fmov x8, d0 -; VBITS_GE_128-NEXT: fmov x9, d1 -; VBITS_GE_128-NEXT: asr x8, x8, #63 -; VBITS_GE_128-NEXT: asr x9, x9, #63 -; VBITS_GE_128-NEXT: fmov d2, x8 -; VBITS_GE_128-NEXT: fmov d3, x9 -; VBITS_GE_128-NEXT: zip1 z0.d, z0.d, z2.d -; VBITS_GE_128-NEXT: zip1 z2.d, z2.d, z2.d -; VBITS_GE_128-NEXT: zip1 z1.d, z1.d, z3.d -; VBITS_GE_128-NEXT: mov z4.d, z0.d[1] -; VBITS_GE_128-NEXT: fmov x0, d0 -; VBITS_GE_128-NEXT: mov z0.d, z2.d[1] -; VBITS_GE_128-NEXT: fmov x2, d2 -; VBITS_GE_128-NEXT: mov z2.d, z1.d[1] -; VBITS_GE_128-NEXT: fmov x4, d1 -; VBITS_GE_128-NEXT: zip1 z1.d, z3.d, z3.d -; VBITS_GE_128-NEXT: fmov x3, d0 -; VBITS_GE_128-NEXT: mov z0.d, z1.d[1] -; VBITS_GE_128-NEXT: fmov x1, d4 -; VBITS_GE_128-NEXT: 
fmov x6, d1 -; VBITS_GE_128-NEXT: fmov x5, d2 -; VBITS_GE_128-NEXT: fmov x7, d0 -; VBITS_GE_128-NEXT: ret -; -; VBITS_GE_1024-LABEL: load_sext_v2i64i256: -; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; VBITS_GE_1024-NEXT: .cfi_def_cfa_offset 16 -; VBITS_GE_1024-NEXT: mov x29, sp -; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 -; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 -; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 -; VBITS_GE_1024-NEXT: sub x9, sp, #112 -; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffffc0 -; VBITS_GE_1024-NEXT: ptrue p0.d, vl2 -; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: ptrue p0.d, vl8 -; VBITS_GE_1024-NEXT: fmov x8, d0 -; VBITS_GE_1024-NEXT: mov z0.d, z0.d[1] -; VBITS_GE_1024-NEXT: fmov x10, d0 -; VBITS_GE_1024-NEXT: asr x9, x8, #63 -; VBITS_GE_1024-NEXT: asr x11, x10, #63 -; VBITS_GE_1024-NEXT: stp x9, x9, [sp, #16] -; VBITS_GE_1024-NEXT: stp x8, x9, [sp] -; VBITS_GE_1024-NEXT: stp x11, x11, [sp, #48] -; VBITS_GE_1024-NEXT: stp x10, x11, [sp, #32] -; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [sp] -; VBITS_GE_1024-NEXT: mov z1.d, z0.d[1] -; VBITS_GE_1024-NEXT: mov z2.d, z0.d[2] -; VBITS_GE_1024-NEXT: mov z3.d, z0.d[3] -; VBITS_GE_1024-NEXT: mov z4.d, z0.d[4] -; VBITS_GE_1024-NEXT: mov z5.d, z0.d[5] -; VBITS_GE_1024-NEXT: mov z6.d, z0.d[6] -; VBITS_GE_1024-NEXT: mov z7.d, z0.d[7] -; VBITS_GE_1024-NEXT: fmov x0, d0 -; VBITS_GE_1024-NEXT: fmov x1, d1 -; VBITS_GE_1024-NEXT: fmov x2, d2 -; VBITS_GE_1024-NEXT: fmov x3, d3 -; VBITS_GE_1024-NEXT: fmov x4, d4 -; VBITS_GE_1024-NEXT: fmov x5, d5 -; VBITS_GE_1024-NEXT: fmov x6, d6 -; VBITS_GE_1024-NEXT: fmov x7, d7 -; VBITS_GE_1024-NEXT: mov sp, x29 -; VBITS_GE_1024-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; VBITS_GE_1024-NEXT: ret -; -; VBITS_GE_2048-LABEL: load_sext_v2i64i256: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; VBITS_GE_2048-NEXT: .cfi_def_cfa_offset 16 -; VBITS_GE_2048-NEXT: mov x29, sp -; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 -; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 -; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 -; VBITS_GE_2048-NEXT: sub x9, sp, #112 -; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffffc0 -; VBITS_GE_2048-NEXT: ptrue p0.d, vl2 -; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ptrue p0.d, vl8 -; VBITS_GE_2048-NEXT: fmov x8, d0 -; VBITS_GE_2048-NEXT: mov z0.d, z0.d[1] -; VBITS_GE_2048-NEXT: fmov x10, d0 -; VBITS_GE_2048-NEXT: asr x9, x8, #63 -; VBITS_GE_2048-NEXT: asr x11, x10, #63 -; VBITS_GE_2048-NEXT: stp x9, x9, [sp, #16] -; VBITS_GE_2048-NEXT: stp x8, x9, [sp] -; VBITS_GE_2048-NEXT: stp x11, x11, [sp, #48] -; VBITS_GE_2048-NEXT: stp x10, x11, [sp, #32] -; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [sp] -; VBITS_GE_2048-NEXT: mov z1.d, z0.d[1] -; VBITS_GE_2048-NEXT: mov z2.d, z0.d[2] -; VBITS_GE_2048-NEXT: mov z3.d, z0.d[3] -; VBITS_GE_2048-NEXT: mov z4.d, z0.d[4] -; VBITS_GE_2048-NEXT: mov z5.d, z0.d[5] -; VBITS_GE_2048-NEXT: mov z6.d, z0.d[6] -; VBITS_GE_2048-NEXT: mov z7.d, z0.d[7] -; VBITS_GE_2048-NEXT: fmov x0, d0 -; VBITS_GE_2048-NEXT: fmov x1, d1 -; VBITS_GE_2048-NEXT: fmov x2, d2 -; VBITS_GE_2048-NEXT: fmov x3, d3 -; VBITS_GE_2048-NEXT: fmov x4, d4 -; VBITS_GE_2048-NEXT: fmov x5, d5 -; VBITS_GE_2048-NEXT: fmov x6, d6 -; VBITS_GE_2048-NEXT: fmov x7, d7 -; VBITS_GE_2048-NEXT: mov sp, x29 -; VBITS_GE_2048-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; VBITS_GE_2048-NEXT: ret +; CHECK-LABEL: load_sext_v2i64i256: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: asr x8, x8, #63 +; CHECK-NEXT: asr x9, x9, #63 +; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: fmov d3, x9 +; CHECK-NEXT: zip1 z0.d, z0.d, z2.d +; CHECK-NEXT: zip1 z2.d, z2.d, z2.d +; CHECK-NEXT: zip1 z1.d, z1.d, z3.d +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: mov z0.d, z2.d[1] +; CHECK-NEXT: fmov x2, d2 +; CHECK-NEXT: mov z2.d, z1.d[1] +; CHECK-NEXT: fmov x4, d1 +; CHECK-NEXT: zip1 z1.d, z3.d, z3.d +; CHECK-NEXT: fmov x3, d0 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: fmov x1, d4 +; CHECK-NEXT: fmov x6, d1 +; CHECK-NEXT: fmov x5, d2 +; CHECK-NEXT: fmov x7, d0 +; CHECK-NEXT: ret %a = load <2 x i64>, <2 x i64>* %ap %val = sext <2 x i64> %a to <2 x i256> ret <2 x i256> %val } define <16 x i64> @load_zext_v16i16i64(<16 x i16>* %ap) #0 { -; VBITS_GE_128-LABEL: load_zext_v16i16i64: -; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: mov x8, #8 -; VBITS_GE_128-NEXT: ptrue p0.h, vl8 -; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_128-NEXT: uunpklo z2.s, z0.h -; VBITS_GE_128-NEXT: ext z0.b, z0.b, z0.b, #8 -; VBITS_GE_128-NEXT: uunpklo z3.s, z1.h -; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 -; VBITS_GE_128-NEXT: uunpklo z4.d, z2.s -; VBITS_GE_128-NEXT: ext z2.b, z2.b, z2.b, #8 -; VBITS_GE_128-NEXT: uunpklo z7.s, z0.h -; VBITS_GE_128-NEXT: uunpklo z16.s, z1.h -; VBITS_GE_128-NEXT: uunpklo z0.d, z3.s -; VBITS_GE_128-NEXT: ext z3.b, z3.b, z3.b, #8 -; VBITS_GE_128-NEXT: uunpklo z5.d, z2.s -; VBITS_GE_128-NEXT: uunpklo z6.d, z7.s -; VBITS_GE_128-NEXT: ext z7.b, z7.b, z7.b, #8 -; VBITS_GE_128-NEXT: uunpklo z2.d, z16.s -; VBITS_GE_128-NEXT: ext z16.b, z16.b, z16.b, #8 -; VBITS_GE_128-NEXT: uunpklo z1.d, z3.s -; VBITS_GE_128-NEXT: 
uunpklo z7.d, z7.s -; VBITS_GE_128-NEXT: uunpklo z3.d, z16.s -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 -; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 killed $z1 -; VBITS_GE_128-NEXT: // kill: def $q2 killed $q2 killed $z2 -; VBITS_GE_128-NEXT: // kill: def $q3 killed $q3 killed $z3 -; VBITS_GE_128-NEXT: // kill: def $q4 killed $q4 killed $z4 -; VBITS_GE_128-NEXT: // kill: def $q5 killed $q5 killed $z5 -; VBITS_GE_128-NEXT: // kill: def $q6 killed $q6 killed $z6 -; VBITS_GE_128-NEXT: // kill: def $q7 killed $q7 killed $z7 -; VBITS_GE_128-NEXT: ret -; -; VBITS_GE_1024-LABEL: load_zext_v16i16i64: -; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8] -; VBITS_GE_1024-NEXT: ret -; -; VBITS_GE_2048-LABEL: load_zext_v16i16i64: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.d, vl16 -; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] -; VBITS_GE_2048-NEXT: ret +; CHECK-LABEL: load_zext_v16i16i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: uunpklo z2.s, z0.h +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z3.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: uunpklo z4.d, z2.s +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: uunpklo z7.s, z0.h +; CHECK-NEXT: uunpklo z16.s, z1.h +; CHECK-NEXT: uunpklo z0.d, z3.s +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: uunpklo z5.d, z2.s +; CHECK-NEXT: uunpklo z6.d, z7.s +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: uunpklo z2.d, z16.s +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: uunpklo z1.d, z3.s +; CHECK-NEXT: uunpklo z7.d, z7.s +; CHECK-NEXT: uunpklo z3.d, z16.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 +; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2 +; CHECK-NEXT: // kill: def $q3 killed $q3 killed $z3 +; CHECK-NEXT: // kill: def $q4 killed $q4 killed $z4 +; CHECK-NEXT: // kill: def $q5 killed $q5 killed $z5 +; CHECK-NEXT: // kill: def $q6 killed $q6 killed $z6 +; CHECK-NEXT: // kill: def $q7 killed $q7 killed $z7 +; CHECK-NEXT: ret %a = load <16 x i16>, <16 x i16>* %ap %val = zext <16 x i16> %a to <16 x i64> ret <16 x i64> %val diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll @@ -0,0 +1,1310 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; ADD +; +define <4 x i8> @add_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: add_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: add_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; 
CHECK-NEXT: add z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: add_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: add z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = add <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: add_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: add z1.b, z1.b, z3.b +; CHECK-NEXT: add z0.b, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = add <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @add_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: add_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w9, #48 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z4.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z5.b }, p0/z, [x1, x9] +; CHECK-NEXT: ld1b { z6.b }, p0/z, [x1, x10] +; CHECK-NEXT: ld1b { z7.b }, p0/z, [x1] +; CHECK-NEXT: add z0.b, z0.b, z4.b +; CHECK-NEXT: add z1.b, z1.b, z5.b +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: add z0.b, z3.b, z7.b +; CHECK-NEXT: add z1.b, z2.b, z6.b +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = add <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <2 x i16> @add_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: add_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: add_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: add_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = add <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: add_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, 
[x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: add z1.h, z1.h, z3.h +; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = add <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @add_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: add_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: mov x10, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z4.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z5.h }, p0/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1h { z6.h }, p0/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1h { z7.h }, p0/z, [x1] +; CHECK-NEXT: add z0.h, z0.h, z4.h +; CHECK-NEXT: add z1.h, z1.h, z5.h +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: add z0.h, z3.h, z7.h +; CHECK-NEXT: add z1.h, z2.h, z6.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = add <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: add_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: add_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = add <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: add_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: add z1.s, z1.s, z3.s +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = add <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @add_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: add_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x9, #12 +; CHECK-NEXT: mov x10, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; CHECK-NEXT: ld1w { z7.s }, p0/z, [x1] +; CHECK-NEXT: add z0.s, z0.s, z4.s +; CHECK-NEXT: add z1.s, z1.s, z5.s +; CHECK-NEXT: stp q0, q1, 
[x0, #32] +; CHECK-NEXT: add z0.s, z3.s, z7.s +; CHECK-NEXT: add z1.s, z2.s, z6.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = add <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: add_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: add_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = add <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: add_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: add z1.d, z1.d, z3.d +; CHECK-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = add <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @add_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: add_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x9, #6 +; CHECK-NEXT: mov x10, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1] +; CHECK-NEXT: add z0.d, z0.d, z4.d +; CHECK-NEXT: add z1.d, z1.d, z5.d +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: add z0.d, z3.d, z7.d +; CHECK-NEXT: add z1.d, z2.d, z6.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = add <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +; +; MUL +; + +define <4 x i8> @mul_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: mul_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: mul_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <8 x i8> %op1, %op2 + ret 
<8 x i8> %res +} + +define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: mul_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = mul <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @mul_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: mul_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z3.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = mul <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @mul_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: mul_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w9, #48 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z4.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z5.b }, p0/z, [x1, x9] +; CHECK-NEXT: ld1b { z6.b }, p0/z, [x1, x10] +; CHECK-NEXT: ld1b { z7.b }, p0/z, [x1] +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z4.b +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z5.b +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z7.b +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z6.b +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = mul <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: mul_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: mul_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: mul_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = mul <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @mul_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: mul_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue 
p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = mul <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @mul_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: mul_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: mov x10, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z4.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z5.h }, p0/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1h { z6.h }, p0/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1h { z7.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z4.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z5.h +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z7.h +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z6.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = mul <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: mul_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: mul_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = mul <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @mul_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: mul_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = mul <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @mul_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: mul_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x9, #12 +; CHECK-NEXT: mov x10, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1w 
{ z4.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; CHECK-NEXT: ld1w { z7.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z4.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z5.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z7.s +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z6.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = mul <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: mul_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: mul_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = mul <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @mul_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: mul_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = mul <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @mul_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: mul_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x9, #6 +; CHECK-NEXT: mov x10, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z4.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z5.d +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z7.d +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z6.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = mul <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +; +; SUB +; + +define <4 x i8> @sub_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: sub_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def 
$d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: sub_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: sub_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sub z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sub <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @sub_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: sub_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: sub z1.b, z1.b, z3.b +; CHECK-NEXT: sub z0.b, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = sub <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @sub_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: sub_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w9, #48 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z4.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z5.b }, p0/z, [x1, x9] +; CHECK-NEXT: ld1b { z6.b }, p0/z, [x1, x10] +; CHECK-NEXT: ld1b { z7.b }, p0/z, [x1] +; CHECK-NEXT: sub z0.b, z0.b, z4.b +; CHECK-NEXT: sub z1.b, z1.b, z5.b +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: sub z0.b, z3.b, z7.b +; CHECK-NEXT: sub z1.b, z2.b, z6.b +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = sub <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <2 x i16> @sub_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: sub_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: sub_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: sub_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret 
+ %res = sub <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @sub_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: sub_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: sub z1.h, z1.h, z3.h +; CHECK-NEXT: sub z0.h, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = sub <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @sub_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: sub_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: mov x10, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z4.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z5.h }, p0/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1h { z6.h }, p0/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1h { z7.h }, p0/z, [x1] +; CHECK-NEXT: sub z0.h, z0.h, z4.h +; CHECK-NEXT: sub z1.h, z1.h, z5.h +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: sub z0.h, z3.h, z7.h +; CHECK-NEXT: sub z1.h, z2.h, z6.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = sub <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: sub_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: sub_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sub <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @sub_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: sub_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: sub z1.s, z1.s, z3.s +; CHECK-NEXT: sub z0.s, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = sub <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @sub_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: sub_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x9, #12 +; CHECK-NEXT: mov x10, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { 
z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; CHECK-NEXT: ld1w { z7.s }, p0/z, [x1] +; CHECK-NEXT: sub z0.s, z0.s, z4.s +; CHECK-NEXT: sub z1.s, z1.s, z5.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: sub z0.s, z3.s, z7.s +; CHECK-NEXT: sub z1.s, z2.s, z6.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = sub <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: sub_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: sub_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sub z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sub <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @sub_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: sub_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: sub z1.d, z1.d, z3.d +; CHECK-NEXT: sub z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = sub <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @sub_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: sub_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x9, #6 +; CHECK-NEXT: mov x10, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1] +; CHECK-NEXT: sub z0.d, z0.d, z4.d +; CHECK-NEXT: sub z1.d, z1.d, z5.d +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: sub z0.d, z3.d, z7.d +; CHECK-NEXT: sub z1.d, z2.d, z6.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = sub <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + + +; +; ABS +; + +define <4 x i8> @abs_v4i8(<4 x i8> %op1) #0 { +; CHECK-LABEL: abs_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI54_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI54_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + 
%res = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %op1, i1 false) + ret <4 x i8> %res +} + +define <8 x i8> @abs_v8i8(<8 x i8> %op1) #0 { +; CHECK-LABEL: abs_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: abs z0.b, p0/m, z0.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false) + ret <8 x i8> %res +} + +define <16 x i8> @abs_v16i8(<16 x i8> %op1) #0 { +; CHECK-LABEL: abs_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: abs z0.b, p0/m, z0.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false) + ret <16 x i8> %res +} + +define void @abs_v32i8(<32 x i8>* %a) #0 { +; CHECK-LABEL: abs_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: abs z1.b, p0/m, z1.b +; CHECK-NEXT: abs z0.b, p0/m, z0.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false) + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @abs_v64i8(<64 x i8>* %a) #0 { +; CHECK-LABEL: abs_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w9, #48 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x0] +; CHECK-NEXT: abs z0.b, p0/m, z0.b +; CHECK-NEXT: abs z1.b, p0/m, z1.b +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: abs z0.b, p0/m, z3.b +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: abs z1.b, p0/m, z2.b +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %res = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %op1, i1 false) + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <2 x i16> @abs_v2i16(<2 x i16> %op1) #0 { +; CHECK-LABEL: abs_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI59_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI59_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %op1, i1 false) + ret <2 x i16> %res +} + +define <4 x i16> @abs_v4i16(<4 x i16> %op1) #0 { +; CHECK-LABEL: abs_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false) + ret <4 x i16> %res +} + +define <8 x i16> @abs_v8i16(<8 x i16> %op1) #0 { +; CHECK-LABEL: abs_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false) + ret <8 x i16> %res +} + +define void 
@abs_v16i16(<16 x i16>* %a) #0 { +; CHECK-LABEL: abs_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: abs z1.h, p0/m, z1.h +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false) + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @abs_v32i16(<32 x i16>* %a) #0 { +; CHECK-LABEL: abs_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mov x10, #8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0] +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: abs z1.h, p0/m, z1.h +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: abs z0.h, p0/m, z3.h +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: abs z1.h, p0/m, z2.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %op1, i1 false) + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @abs_v2i32(<2 x i32> %op1) #0 { +; CHECK-LABEL: abs_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false) + ret <2 x i32> %res +} + +define <4 x i32> @abs_v4i32(<4 x i32> %op1) #0 { +; CHECK-LABEL: abs_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false) + ret <4 x i32> %res +} + +define void @abs_v8i32(<8 x i32>* %a) #0 { +; CHECK-LABEL: abs_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: abs z1.s, p0/m, z1.s +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @abs_v16i32(<16 x i32>* %a) #0 { +; CHECK-LABEL: abs_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x9, #12 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: mov x10, #4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0] +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: abs z1.s, p0/m, z1.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: abs z0.s, p0/m, z3.s +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: abs z1.s, p0/m, z2.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %res = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %op1, i1 false) + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> 
@abs_v1i64(<1 x i64> %op1) #0 { +; CHECK-LABEL: abs_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false) + ret <1 x i64> %res +} + +define <2 x i64> @abs_v2i64(<2 x i64> %op1) #0 { +; CHECK-LABEL: abs_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false) + ret <2 x i64> %res +} + +define void @abs_v4i64(<4 x i64>* %a) #0 { +; CHECK-LABEL: abs_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: abs z1.d, p0/m, z1.d +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @abs_v8i64(<8 x i64>* %a) #0 { +; CHECK-LABEL: abs_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x9, #6 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov x10, #2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: abs z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: abs z0.d, p0/m, z3.d +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: abs z1.d, p0/m, z2.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %res = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %op1, i1 false) + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +declare <4 x i8> @llvm.abs.v4i8(<4 x i8>, i1) +declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1) +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) +declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) +declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1) +declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1) +declare <2 x i16> @llvm.abs.v2i16(<2 x i16>, i1) +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) +declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1) +declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1) +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) +declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1) +declare <1 x i64> @llvm.abs.v1i64(<1 x i64>, i1) +declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1) +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) +declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1) + + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -0,0 +1,1229 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; 
SDIV +; + +define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: sdiv_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: asr z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = sdiv <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: sdiv_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.h, z0.h[7] +; CHECK-NEXT: mov z3.h, z0.h[5] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: mov z4.h, z0.h[4] +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z6.h, z0.h[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: strb w8, [sp, #13] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: strb w10, [sp, #14] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = sdiv <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: sdiv_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: sunpkhi z2.h, z1.b +; CHECK-NEXT: sunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sunpkhi z4.s, z2.h +; CHECK-NEXT: sunpkhi z5.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, 
z3.s +; CHECK-NEXT: sunpkhi z3.s, z1.h +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: sdiv_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z5.h, z0.b +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpkhi z4.h, z2.b +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: sunpkhi z6.s, z4.h +; CHECK-NEXT: sunpkhi z7.s, z5.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpkhi z16.s, z2.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z6.h +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: sunpkhi z2.h, z3.b +; CHECK-NEXT: sunpkhi z6.h, z1.b +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z16.s +; CHECK-NEXT: sunpkhi z7.s, z2.h +; CHECK-NEXT: sunpkhi z16.s, z6.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z6.s +; CHECK-NEXT: sunpkhi z6.s, z3.h +; CHECK-NEXT: sunpkhi z16.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z16.s +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z7.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z6.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h +; CHECK-NEXT: uzp1 z1.b, z1.b, z2.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z4.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = sdiv <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @sdiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: sdiv_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w9, #48 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: ptrue p1.b, vl16 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, x8] +; CHECK-NEXT: ld1b { z3.b }, p1/z, [x0, x9] +; CHECK-NEXT: ld1b { z4.b }, p1/z, [x0, x10] +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z5.b }, p1/z, [x1, x10] +; CHECK-NEXT: ld1b { z7.b }, p1/z, [x1, x9] +; CHECK-NEXT: ld1b { z6.b }, p1/z, [x1, x8] +; CHECK-NEXT: sunpkhi z16.h, z4.b +; CHECK-NEXT: sunpklo z4.h, z4.b +; CHECK-NEXT: sunpkhi z1.h, z5.b +; CHECK-NEXT: sunpkhi z18.s, z16.h +; CHECK-NEXT: sunpkhi z17.s, z1.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z16.s, z16.h +; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z16.s +; CHECK-NEXT: sunpklo z5.h, z5.b +; CHECK-NEXT: uzp1 z1.h, z1.h, z17.h +; CHECK-NEXT: sunpkhi z17.s, z5.h +; CHECK-NEXT: sunpkhi z18.s, z4.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpklo 
z4.s, z4.h +; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.h, z7.b +; CHECK-NEXT: sunpkhi z18.h, z3.b +; CHECK-NEXT: sunpkhi z19.s, z5.h +; CHECK-NEXT: sunpkhi z20.s, z18.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpklo z18.s, z18.h +; CHECK-NEXT: sunpklo z7.h, z7.b +; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: sdivr z19.s, p0/m, z19.s, z20.s +; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z18.s +; CHECK-NEXT: sunpkhi z18.s, z7.h +; CHECK-NEXT: sunpkhi z20.s, z3.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z20.s +; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z7.s +; CHECK-NEXT: uzp1 z5.h, z5.h, z19.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z18.h +; CHECK-NEXT: ld1b { z16.b }, p1/z, [x1] +; CHECK-NEXT: uzp1 z3.b, z3.b, z5.b +; CHECK-NEXT: sunpkhi z5.h, z6.b +; CHECK-NEXT: sunpkhi z7.h, z2.b +; CHECK-NEXT: uzp1 z4.h, z4.h, z17.h +; CHECK-NEXT: sunpkhi z17.s, z5.h +; CHECK-NEXT: sunpkhi z18.s, z7.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sunpklo z6.h, z6.b +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z7.s +; CHECK-NEXT: sunpkhi z7.s, z6.h +; CHECK-NEXT: sunpkhi z18.s, z2.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z18.s +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z6.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z7.h +; CHECK-NEXT: sunpkhi z6.h, z16.b +; CHECK-NEXT: sunpkhi z7.h, z0.b +; CHECK-NEXT: uzp1 z5.h, z5.h, z17.h +; CHECK-NEXT: sunpkhi z17.s, z6.h +; CHECK-NEXT: sunpkhi z18.s, z7.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: uzp1 z2.b, z2.b, z5.b +; CHECK-NEXT: uzp1 z5.h, z6.h, z17.h +; CHECK-NEXT: sunpklo z6.h, z16.b +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpkhi z7.s, z6.h +; CHECK-NEXT: sunpkhi z16.s, z0.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z6.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z7.h +; CHECK-NEXT: uzp1 z1.b, z4.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z5.b +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = sdiv <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: sdiv_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI5_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8] +; CHECK-NEXT: lsl z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: asr z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: sdiv_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // 
kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = sdiv <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: sdiv_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: sdiv_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpkhi z4.s, z2.h +; CHECK-NEXT: sunpkhi z6.s, z3.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.s, z1.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z5.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z4.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = sdiv <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @sdiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: sdiv_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: mov x10, #8 +; CHECK-NEXT: ptrue p1.h, vl8 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.h }, p1/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z4.h }, p1/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1h { z5.h }, p1/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1h { z6.h }, p1/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z17.h }, p1/z, [x1] +; CHECK-NEXT: sunpkhi z18.s, z1.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpkhi z16.s, z2.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpkhi z7.s, z4.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: sunpkhi z16.s, z5.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: 
sdiv z2.s, p0/m, z2.s, z4.s +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z5.s +; CHECK-NEXT: sunpkhi z4.s, z6.h +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z6.s +; CHECK-NEXT: sunpkhi z5.s, z17.h +; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s +; CHECK-NEXT: sunpkhi z6.s, z3.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z4.h +; CHECK-NEXT: movprfx z4, z6 +; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpklo z5.s, z17.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z16.h +; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: uzp1 z0.h, z3.h, z4.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z7.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = sdiv <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: sdiv_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: sdiv_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @sdiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: sdiv_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = sdiv <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @sdiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: sdiv_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x9, #12 +; CHECK-NEXT: mov x10, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; CHECK-NEXT: ld1w { z7.s }, p0/z, [x1] +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z4.s +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z5.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z7.s +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z6.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = sdiv 
<16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: sdiv_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: sdiv_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @sdiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: sdiv_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: sdiv z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = sdiv <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @sdiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: sdiv_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x9, #6 +; CHECK-NEXT: mov x10, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1] +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z4.d +; CHECK-NEXT: sdiv z1.d, p0/m, z1.d, z5.d +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z7.d +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: sdiv z1.d, p0/m, z1.d, z6.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = sdiv <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +; +; UDIV +; + +define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: udiv_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI18_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh 
w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = udiv <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: udiv_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.h, z0.h[7] +; CHECK-NEXT: mov z3.h, z0.h[5] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: mov z4.h, z0.h[4] +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z6.h, z0.h[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: strb w8, [sp, #13] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: strb w10, [sp, #14] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = udiv <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: udiv_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: uunpkhi z2.h, z1.b +; CHECK-NEXT: uunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpkhi z4.s, z2.h +; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uunpkhi z3.s, z1.h +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: udiv_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z5.h, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z4.h, z2.b +; 
CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: uunpkhi z6.s, z4.h +; CHECK-NEXT: uunpkhi z7.s, z5.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpkhi z16.s, z2.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z6.h +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uunpkhi z2.h, z3.b +; CHECK-NEXT: uunpkhi z6.h, z1.b +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z16.s +; CHECK-NEXT: uunpkhi z7.s, z2.h +; CHECK-NEXT: uunpkhi z16.s, z6.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z6.s +; CHECK-NEXT: uunpkhi z6.s, z3.h +; CHECK-NEXT: uunpkhi z16.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z16.s +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z7.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z6.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h +; CHECK-NEXT: uzp1 z1.b, z1.b, z2.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z4.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = udiv <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @udiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: udiv_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w9, #48 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: ptrue p1.b, vl16 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, x8] +; CHECK-NEXT: ld1b { z3.b }, p1/z, [x0, x9] +; CHECK-NEXT: ld1b { z4.b }, p1/z, [x0, x10] +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z5.b }, p1/z, [x1, x10] +; CHECK-NEXT: ld1b { z7.b }, p1/z, [x1, x9] +; CHECK-NEXT: ld1b { z6.b }, p1/z, [x1, x8] +; CHECK-NEXT: uunpkhi z16.h, z4.b +; CHECK-NEXT: uunpklo z4.h, z4.b +; CHECK-NEXT: uunpkhi z1.h, z5.b +; CHECK-NEXT: uunpkhi z18.s, z16.h +; CHECK-NEXT: uunpkhi z17.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z16.s, z16.h +; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z16.s +; CHECK-NEXT: uunpklo z5.h, z5.b +; CHECK-NEXT: uzp1 z1.h, z1.h, z17.h +; CHECK-NEXT: uunpkhi z17.s, z5.h +; CHECK-NEXT: uunpkhi z18.s, z4.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.h, z7.b +; CHECK-NEXT: uunpkhi z18.h, z3.b +; CHECK-NEXT: uunpkhi z19.s, z5.h +; CHECK-NEXT: uunpkhi z20.s, z18.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpklo z18.s, z18.h +; CHECK-NEXT: uunpklo z7.h, z7.b +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: udivr z19.s, p0/m, z19.s, z20.s +; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z18.s +; CHECK-NEXT: uunpkhi z18.s, z7.h +; CHECK-NEXT: uunpkhi z20.s, z3.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z20.s +; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z7.s +; CHECK-NEXT: uzp1 z5.h, z5.h, z19.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z18.h +; CHECK-NEXT: ld1b { z16.b }, p1/z, [x1] +; CHECK-NEXT: uzp1 z3.b, z3.b, z5.b +; CHECK-NEXT: uunpkhi z5.h, z6.b +; CHECK-NEXT: uunpkhi z7.h, z2.b +; CHECK-NEXT: uzp1 z4.h, 
z4.h, z17.h +; CHECK-NEXT: uunpkhi z17.s, z5.h +; CHECK-NEXT: uunpkhi z18.s, z7.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: uunpklo z6.h, z6.b +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z7.s +; CHECK-NEXT: uunpkhi z7.s, z6.h +; CHECK-NEXT: uunpkhi z18.s, z2.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z18.s +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z6.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z7.h +; CHECK-NEXT: uunpkhi z6.h, z16.b +; CHECK-NEXT: uunpkhi z7.h, z0.b +; CHECK-NEXT: uzp1 z5.h, z5.h, z17.h +; CHECK-NEXT: uunpkhi z17.s, z6.h +; CHECK-NEXT: uunpkhi z18.s, z7.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: uzp1 z2.b, z2.b, z5.b +; CHECK-NEXT: uzp1 z5.h, z6.h, z17.h +; CHECK-NEXT: uunpklo z6.h, z16.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z7.s, z6.h +; CHECK-NEXT: uunpkhi z16.s, z0.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z6.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z7.h +; CHECK-NEXT: uzp1 z1.b, z4.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z5.b +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = udiv <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: udiv_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI23_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI23_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8] +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: udiv_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = udiv <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: udiv_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; 
CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: udiv_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpkhi z4.s, z2.h +; CHECK-NEXT: uunpkhi z6.s, z3.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.s, z1.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z5.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z4.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = udiv <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @udiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: udiv_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: mov x10, #8 +; CHECK-NEXT: ptrue p1.h, vl8 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.h }, p1/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z4.h }, p1/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1h { z5.h }, p1/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1h { z6.h }, p1/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z17.h }, p1/z, [x1] +; CHECK-NEXT: uunpkhi z18.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpkhi z16.s, z2.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpkhi z7.s, z4.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: uunpkhi z16.s, z5.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z4.s +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z5.s +; CHECK-NEXT: uunpkhi z4.s, z6.h +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z6.s +; CHECK-NEXT: uunpkhi z5.s, z17.h +; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s +; CHECK-NEXT: uunpkhi z6.s, z3.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z4.h +; CHECK-NEXT: movprfx z4, z6 +; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpklo z5.s, z17.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z16.h +; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: uzp1 z0.h, z3.h, z4.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z7.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = udiv <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @udiv_v2i32(<2 
x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: udiv_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: udiv_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: udiv_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = udiv <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @udiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: udiv_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x9, #12 +; CHECK-NEXT: mov x10, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; CHECK-NEXT: ld1w { z7.s }, p0/z, [x1] +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z4.s +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z5.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z7.s +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z6.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = udiv <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: udiv_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: udiv_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @udiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: udiv_v4i64: +; CHECK: // %bb.0: +; 
CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: udiv z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = udiv <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @udiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: udiv_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x9, #6 +; CHECK-NEXT: mov x10, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1] +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z4.d +; CHECK-NEXT: udiv z1.d, p0/m, z1.d, z5.d +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z7.d +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: udiv z1.d, p0/m, z1.d, z6.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = udiv <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +define void @udiv_constantsplat_v8i32(<8 x i32>* %a) #0 { +; CHECK-LABEL: udiv_constantsplat_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: adrp x8, .LCPI36_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI36_0 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI36_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI36_1 +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI36_2 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI36_2 +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x8] +; CHECK-NEXT: movprfx z5, z1 +; CHECK-NEXT: umulh z5.s, p0/m, z5.s, z2.s +; CHECK-NEXT: umulh z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: sub z1.s, z1.s, z5.s +; CHECK-NEXT: sub z0.s, z0.s, z2.s +; CHECK-NEXT: lsr z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z3.s +; CHECK-NEXT: add z1.s, z1.s, z5.s +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: lsr z1.s, p0/m, z1.s, z4.s +; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z4.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %res = udiv <8 x i32> %op1, + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll @@ -0,0 +1,546 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; AND +; + +define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: and_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def 
$d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: and_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @and_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: and_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = and <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: and_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: and_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @and_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: and_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = and <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: and_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: and_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @and_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: and_v8i32: +; CHECK: // %bb.0: +; 
CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = and <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: and_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: and_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @and_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: and_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = and <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; OR +; + +define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: or_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: or_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @or_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: or_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = or <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: or_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; 
CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: or_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @or_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: or_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = or <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: or_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: or_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @or_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: or_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = or <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: or_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: or_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @or_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: or_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, 
[x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = or <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; XOR +; + +define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: xor_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: xor_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @xor_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: xor_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: eor z1.d, z1.d, z3.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = xor <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: xor_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: xor_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @xor_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: xor_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: eor z1.d, z1.d, z3.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = xor <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: xor_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 
killed $z0 +; CHECK-NEXT: ret + %res = xor <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: xor_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @xor_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: xor_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: eor z1.d, z1.d, z3.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = xor <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: xor_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: xor_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @xor_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: xor_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: eor z1.d, z1.d, z3.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = xor <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -0,0 +1,924 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +; This test only tests the legal types for a given vector width, as mulh nodes +; do not get generated for non-legal types. 
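+; As an illustration only (a sketch, not an additional test; %a and %b are
+; placeholder names), a mulh node is formed when a multiply of widened
+; operands is followed by a logical shift right by the source element width,
+; e.g. for <8 x i8>:
+;   %1 = sext <8 x i8> %a to <8 x i16>
+;   %2 = sext <8 x i8> %b to <8 x i16>
+;   %mul = mul <8 x i16> %1, %2
+;   %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+;   %res = trunc <8 x i16> %shr to <8 x i8>
+; (use zext for the unsigned UMULH form). The combine only matches this shape
+; for legal types, which is why only legal types are exercised below.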
+ +target triple = "aarch64-unknown-linux-gnu" + +; +; SMULH +; + +define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: smulh_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_1 +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: asr z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z3.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %insert = insertelement <4 x i16> undef, i16 4, i64 0 + %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer + %1 = sext <4 x i8> %op1 to <4 x i16> + %2 = sext <4 x i8> %op2 to <4 x i16> + %mul = mul <4 x i16> %1, %2 + %shr = lshr <4 x i16> %mul, %splat + %res = trunc <4 x i16> %shr to <4 x i8> + ret <4 x i8> %res +} + +define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: smulh_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %insert = insertelement <8 x i16> undef, i16 8, i64 0 + %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer + %1 = sext <8 x i8> %op1 to <8 x i16> + %2 = sext <8 x i8> %op2 to <8 x i16> + %mul = mul <8 x i16> %1, %2 + %shr = lshr <8 x i16> %mul, %splat + %res = trunc <8 x i16> %shr to <8 x i8> + ret <8 x i8> %res +} + +define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: smulh_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <16 x i8> %op1 to <16 x i16> + %2 = sext <16 x i8> %op2 to <16 x i16> + %mul = mul <16 x i16> %1, %2 + %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %res = trunc <16 x i16> %shr to <16 x i8> + ret <16 x i8> %res +} + +define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: smulh_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ptrue p1.h, vl8 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI3_0 +; CHECK-NEXT: sunpklo z5.h, z2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: sunpklo z7.h, z3.b +; CHECK-NEXT: ld1h { z16.h }, p1/z, [x8] +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: mul z5.h, p1/m, z5.h, z7.h +; CHECK-NEXT: mul z2.h, p1/m, z2.h, z3.h +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: lsr z3.h, p1/m, z3.h, z16.h +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: sunpklo z4.h, z0.b +; CHECK-NEXT: sunpklo z6.h, z1.b +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; 
CHECK-NEXT: lsr z2.h, p1/m, z2.h, z16.h +; CHECK-NEXT: mov z5.h, z3.h[7] +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: mul z4.h, p1/m, z4.h, z6.h +; CHECK-NEXT: mov z6.h, z3.h[6] +; CHECK-NEXT: mov z7.h, z3.h[5] +; CHECK-NEXT: mov z17.h, z3.h[4] +; CHECK-NEXT: mov z18.h, z3.h[3] +; CHECK-NEXT: mov z19.h, z3.h[2] +; CHECK-NEXT: mov z20.h, z3.h[1] +; CHECK-NEXT: mov z3.h, z2.h[7] +; CHECK-NEXT: mov z21.h, z2.h[6] +; CHECK-NEXT: mov z22.h, z2.h[5] +; CHECK-NEXT: mov z23.h, z2.h[4] +; CHECK-NEXT: mov z24.h, z2.h[3] +; CHECK-NEXT: mov z25.h, z2.h[2] +; CHECK-NEXT: mov z26.h, z2.h[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mul z0.h, p1/m, z0.h, z1.h +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w8, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: strb w9, [sp, #8] +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: strb w10, [sp, #7] +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: lsr z0.h, p1/m, z0.h, z16.h +; CHECK-NEXT: strb w8, [sp, #6] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: strb w9, [sp, #5] +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: strb w10, [sp, #4] +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: strb w8, [sp, #3] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: strb w9, [sp, #2] +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: strb w10, [sp, #1] +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: strb w8, [sp, #15] +; CHECK-NEXT: fmov w8, s23 +; CHECK-NEXT: strb w9, [sp, #14] +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: strb w10, [sp, #13] +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: strb w8, [sp, #12] +; CHECK-NEXT: fmov w8, s26 +; CHECK-NEXT: movprfx z1, z4 +; CHECK-NEXT: lsr z1.h, p1/m, z1.h, z16.h +; CHECK-NEXT: strb w9, [sp, #11] +; CHECK-NEXT: mov z2.h, z1.h[7] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strb w10, [sp, #10] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: strb w8, [sp, #9] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.h, z1.h[6] +; CHECK-NEXT: mov z4.h, z1.h[5] +; CHECK-NEXT: mov z5.h, z1.h[4] +; CHECK-NEXT: strb w9, [sp, #16] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w10, [sp, #24] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w8, [sp, #23] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: mov z6.h, z1.h[3] +; CHECK-NEXT: mov z7.h, z1.h[2] +; CHECK-NEXT: mov z16.h, z1.h[1] +; CHECK-NEXT: strb w9, [sp, #22] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strb w10, [sp, #21] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strb w8, [sp, #20] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z1.h, z0.h[7] +; CHECK-NEXT: mov z17.h, z0.h[6] +; CHECK-NEXT: mov z18.h, z0.h[5] +; CHECK-NEXT: strb w9, [sp, #19] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strb w10, [sp, #18] +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: strb w8, [sp, #17] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z19.h, z0.h[4] +; CHECK-NEXT: mov z20.h, z0.h[3] +; CHECK-NEXT: mov z21.h, z0.h[2] +; CHECK-NEXT: strb w9, [sp, #31] +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: strb w10, [sp, #30] +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: strb w8, [sp, #29] +; CHECK-NEXT: fmov w8, s21 +; CHECK-NEXT: mov z22.h, z0.h[1] +; CHECK-NEXT: strb w9, [sp, #28] +; CHECK-NEXT: fmov w9, s22 +; CHECK-NEXT: strb w10, [sp, #27] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: strb w8, [sp, #26] +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: strb w9, [sp, #25] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x10] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x8] +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %1 = sext 
<32 x i8> %op1 to <32 x i16> + %2 = sext <32 x i8> %op2 to <32 x i16> + %mul = mul <32 x i16> %1, %2 + %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %res = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: smulh_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI4_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8] +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: lsl z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: asr z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <2 x i16> %op1 to <2 x i32> + %2 = sext <2 x i16> %op2 to <2 x i32> + %mul = mul <2 x i32> %1, %2 + %shr = lshr <2 x i32> %mul, <i32 16, i32 16> + %res = trunc <2 x i32> %shr to <2 x i16> + ret <2 x i16> %res +} + +define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: smulh_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <4 x i16> %op1 to <4 x i32> + %2 = sext <4 x i16> %op2 to <4 x i32> + %mul = mul <4 x i32> %1, %2 + %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16> + %res = trunc <4 x i32> %shr to <4 x i16> + ret <4 x i16> %res +} + +define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: smulh_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <8 x i16> %op1 to <8 x i32> + %2 = sext <8 x i16> %op2 to <8 x i32> + %mul = mul <8 x i32> %1, %2 + %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %res = trunc <8 x i32> %shr to <8 x i16> + ret <8 x i16> %res +} + +define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: smulh_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: smulh z4.h, p0/m, z4.h, z3.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: smulh z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: smulh z3.h, p0/m, z3.h, z2.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z1.h +; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h +; CHECK-NEXT: stp q4, q3, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %1 = sext <16 x i16> %op1 to <16 x i32> + %2 = sext <16 x i16> %op2 to <16 x i32> + %mul = mul <16 x i32> %1, %2 + %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %res = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> 
@smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: smulh_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <2 x i32> %op1 to <2 x i64> + %2 = sext <2 x i32> %op2 to <2 x i64> + %mul = mul <2 x i64> %1, %2 + %shr = lshr <2 x i64> %mul, <i64 32, i64 32> + %res = trunc <2 x i64> %shr to <2 x i32> + ret <2 x i32> %res +} + +define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: smulh_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <4 x i32> %op1 to <4 x i64> + %2 = sext <4 x i32> %op2 to <4 x i64> + %mul = mul <4 x i64> %1, %2 + %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32> + %res = trunc <4 x i64> %shr to <4 x i32> + ret <4 x i32> %res +} + +define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: smulh_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: smulh z4.s, p0/m, z4.s, z3.s +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: smulh z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: smulh z3.s, p0/m, z3.s, z2.s +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: splice z4.s, p0, z4.s, z1.s +; CHECK-NEXT: splice z3.s, p0, z3.s, z0.s +; CHECK-NEXT: stp q4, q3, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %1 = sext <8 x i32> %op1 to <8 x i64> + %2 = sext <8 x i32> %op2 to <8 x i64> + %mul = mul <8 x i64> %1, %2 + %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> + %res = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: smulh_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %insert = insertelement <1 x i128> undef, i128 64, i128 0 + %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer + %1 = sext <1 x i64> %op1 to <1 x i128> + %2 = sext <1 x i64> %op2 to <1 x i128> + %mul = mul <1 x i128> %1, %2 + %shr = lshr <1 x i128> %mul, %splat + %res = trunc <1 x i128> %shr to <1 x i64> + ret <1 x i64> %res +} + +define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: smulh_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <2 x i64> %op1 to <2 x i128> + %2 = sext <2 x i64> %op2 to <2 x i128> 
+ %mul = mul <2 x i128> %1, %2 + %shr = lshr <2 x i128> %mul, <i128 64, i128 64> + %res = trunc <2 x i128> %shr to <2 x i64> + ret <2 x i64> %res +} + +define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: smulh_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: mov z2.d, z3.d[1] +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: fmov x13, d2 +; CHECK-NEXT: fmov x14, d4 +; CHECK-NEXT: smulh x8, x8, x10 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: smulh x9, x9, x11 +; CHECK-NEXT: smulh x12, x12, x13 +; CHECK-NEXT: smulh x10, x14, x10 +; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: fmov d1, x12 +; CHECK-NEXT: fmov d3, x10 +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: splice z2.d, p0, z2.d, z3.d +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %1 = sext <4 x i64> %op1 to <4 x i128> + %2 = sext <4 x i64> %op2 to <4 x i128> + %mul = mul <4 x i128> %1, %2 + %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64> + %res = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; UMULH +; + +define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: umulh_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI14_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI14_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI14_1 +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x8] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z3.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <4 x i8> %op1 to <4 x i16> + %2 = zext <4 x i8> %op2 to <4 x i16> + %mul = mul <4 x i16> %1, %2 + %shr = lshr <4 x i16> %mul, <i16 4, i16 4, i16 4, i16 4> + %res = trunc <4 x i16> %shr to <4 x i8> + ret <4 x i8> %res +} + +define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: umulh_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <8 x i8> %op1 to <8 x i16> + %2 = zext <8 x i8> %op2 to <8 x i16> + %mul = mul <8 x i16> %1, %2 + %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %res = trunc <8 x i16> %shr to <8 x i8> + ret <8 x i8> %res +} + +define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: umulh_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <16 x i8> %op1 to <16 x i16> + %2 = zext <16 x i8> %op2 to <16 x i16> + %mul = mul <16 x i16> %1, %2 
+ %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %res = trunc <16 x i16> %shr to <16 x i8> + ret <16 x i8> %res +} + +define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: umulh_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ptrue p1.h, vl8 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI17_0 +; CHECK-NEXT: uunpklo z5.h, z2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: uunpklo z7.h, z3.b +; CHECK-NEXT: ld1h { z16.h }, p1/z, [x8] +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: mul z5.h, p1/m, z5.h, z7.h +; CHECK-NEXT: mul z2.h, p1/m, z2.h, z3.h +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: lsr z3.h, p1/m, z3.h, z16.h +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: uunpklo z4.h, z0.b +; CHECK-NEXT: uunpklo z6.h, z1.b +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: lsr z2.h, p1/m, z2.h, z16.h +; CHECK-NEXT: mov z5.h, z3.h[7] +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: mul z4.h, p1/m, z4.h, z6.h +; CHECK-NEXT: mov z6.h, z3.h[6] +; CHECK-NEXT: mov z7.h, z3.h[5] +; CHECK-NEXT: mov z17.h, z3.h[4] +; CHECK-NEXT: mov z18.h, z3.h[3] +; CHECK-NEXT: mov z19.h, z3.h[2] +; CHECK-NEXT: mov z20.h, z3.h[1] +; CHECK-NEXT: mov z3.h, z2.h[7] +; CHECK-NEXT: mov z21.h, z2.h[6] +; CHECK-NEXT: mov z22.h, z2.h[5] +; CHECK-NEXT: mov z23.h, z2.h[4] +; CHECK-NEXT: mov z24.h, z2.h[3] +; CHECK-NEXT: mov z25.h, z2.h[2] +; CHECK-NEXT: mov z26.h, z2.h[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mul z0.h, p1/m, z0.h, z1.h +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w8, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: strb w9, [sp, #8] +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: strb w10, [sp, #7] +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: lsr z0.h, p1/m, z0.h, z16.h +; CHECK-NEXT: strb w8, [sp, #6] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: strb w9, [sp, #5] +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: strb w10, [sp, #4] +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: strb w8, [sp, #3] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: strb w9, [sp, #2] +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: strb w10, [sp, #1] +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: strb w8, [sp, #15] +; CHECK-NEXT: fmov w8, s23 +; CHECK-NEXT: strb w9, [sp, #14] +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: strb w10, [sp, #13] +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: strb w8, [sp, #12] +; CHECK-NEXT: fmov w8, s26 +; CHECK-NEXT: movprfx z1, z4 +; CHECK-NEXT: lsr z1.h, p1/m, z1.h, z16.h +; CHECK-NEXT: strb w9, [sp, #11] +; CHECK-NEXT: mov z2.h, z1.h[7] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strb w10, [sp, #10] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: strb w8, [sp, #9] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.h, z1.h[6] +; CHECK-NEXT: mov z4.h, z1.h[5] +; CHECK-NEXT: mov z5.h, z1.h[4] +; CHECK-NEXT: strb w9, [sp, #16] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w10, [sp, #24] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w8, [sp, #23] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: mov z6.h, z1.h[3] +; CHECK-NEXT: mov z7.h, z1.h[2] +; CHECK-NEXT: mov z16.h, z1.h[1] +; CHECK-NEXT: strb w9, [sp, #22] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strb w10, [sp, #21] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strb w8, [sp, #20] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z1.h, z0.h[7] +; CHECK-NEXT: mov z17.h, z0.h[6] +; CHECK-NEXT: mov z18.h, z0.h[5] +; CHECK-NEXT: strb w9, [sp, #19] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strb w10, [sp, #18] +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: strb w8, [sp, #17] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z19.h, z0.h[4] +; CHECK-NEXT: mov z20.h, z0.h[3] +; CHECK-NEXT: mov z21.h, z0.h[2] +; CHECK-NEXT: strb w9, [sp, #31] +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: strb w10, [sp, #30] +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: strb w8, [sp, #29] +; CHECK-NEXT: fmov w8, s21 +; CHECK-NEXT: mov z22.h, z0.h[1] +; CHECK-NEXT: strb w9, [sp, #28] +; CHECK-NEXT: fmov w9, s22 +; CHECK-NEXT: strb w10, [sp, #27] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: strb w8, [sp, #26] +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: strb w9, [sp, #25] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x10] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x8] +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %1 = zext <32 x i8> %op1 to <32 x i16> + %2 = zext <32 x i8> %op2 to <32 x i16> + %mul = mul <32 x i16> %1, %2 + %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %res = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: umulh_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI18_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI18_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI18_1 +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; 
CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z3.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <2 x i16> %op1 to <2 x i32> + %2 = zext <2 x i16> %op2 to <2 x i32> + %mul = mul <2 x i32> %1, %2 + %shr = lshr <2 x i32> %mul, <i32 16, i32 16> + %res = trunc <2 x i32> %shr to <2 x i16> + ret <2 x i16> %res +} + +define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: umulh_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <4 x i16> %op1 to <4 x i32> + %2 = zext <4 x i16> %op2 to <4 x i32> + %mul = mul <4 x i32> %1, %2 + %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16> + %res = trunc <4 x i32> %shr to <4 x i16> + ret <4 x i16> %res +} + +define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: umulh_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <8 x i16> %op1 to <8 x i32> + %2 = zext <8 x i16> %op2 to <8 x i32> + %mul = mul <8 x i32> %1, %2 + %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %res = trunc <8 x i32> %shr to <8 x i16> + ret <8 x i16> %res +} + +define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: umulh_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: umulh z4.h, p0/m, z4.h, z3.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: umulh z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: umulh z3.h, p0/m, z3.h, z2.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z1.h +; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h +; CHECK-NEXT: stp q4, q3, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %1 = zext <16 x i16> %op1 to <16 x i32> + %2 = zext <16 x i16> %op2 to <16 x i32> + %mul = mul <16 x i32> %1, %2 + %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %res = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: umulh_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <2 x i32> %op1 to <2 x i64> + %2 = zext <2 x i32> %op2 to <2 x i64> + %mul = mul <2 x i64> %1, %2 + %shr = lshr <2 x i64> %mul, <i64 32, i64 32> + %res = trunc <2 x i64> %shr to <2 x i32> + ret <2 x i32> %res +} + +define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: umulh_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; 
CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <4 x i32> %op1 to <4 x i64> + %2 = zext <4 x i32> %op2 to <4 x i64> + %mul = mul <4 x i64> %1, %2 + %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32> + %res = trunc <4 x i64> %shr to <4 x i32> + ret <4 x i32> %res +} + +define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: umulh_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: umulh z4.s, p0/m, z4.s, z3.s +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: umulh z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: umulh z3.s, p0/m, z3.s, z2.s +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: splice z4.s, p0, z4.s, z1.s +; CHECK-NEXT: splice z3.s, p0, z3.s, z0.s +; CHECK-NEXT: stp q4, q3, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %insert = insertelement <8 x i64> undef, i64 32, i64 0 + %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer + %1 = zext <8 x i32> %op1 to <8 x i64> + %2 = zext <8 x i32> %op2 to <8 x i64> + %mul = mul <8 x i64> %1, %2 + %shr = lshr <8 x i64> %mul, %splat + %res = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: umulh_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <1 x i64> %op1 to <1 x i128> + %2 = zext <1 x i64> %op2 to <1 x i128> + %mul = mul <1 x i128> %1, %2 + %shr = lshr <1 x i128> %mul, <i128 64> + %res = trunc <1 x i128> %shr to <1 x i64> + ret <1 x i64> %res +} + +define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: umulh_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <2 x i64> %op1 to <2 x i128> + %2 = zext <2 x i64> %op2 to <2 x i128> + %mul = mul <2 x i128> %1, %2 + %shr = lshr <2 x i128> %mul, <i128 64, i128 64> + %res = trunc <2 x i128> %shr to <2 x i64> + ret <2 x i64> %res +} + +define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: umulh_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: mov z2.d, z3.d[1] +; 
CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: fmov x13, d2 +; CHECK-NEXT: fmov x14, d4 +; CHECK-NEXT: umulh x8, x8, x10 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: umulh x9, x9, x11 +; CHECK-NEXT: umulh x12, x12, x13 +; CHECK-NEXT: umulh x10, x14, x10 +; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: fmov d1, x12 +; CHECK-NEXT: fmov d3, x10 +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: splice z2.d, p0, z2.d, z3.d +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %1 = zext <4 x i64> %op1 to <4 x i128> + %2 = zext <4 x i64> %op2 to <4 x i128> + %mul = mul <4 x i128> %1, %2 + %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64> + %res = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -0,0 +1,774 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; SREM +; + +define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: srem_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: asr z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: sunpklo z2.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = srem <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: srem_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: sunpklo z2.h, z1.b +; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z4.s, z2.h +; CHECK-NEXT: sunpkhi z5.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.h, z2.h[7] +; CHECK-NEXT: mov z5.h, z2.h[5] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.h, 
z2.h[6] +; CHECK-NEXT: mov z6.h, z2.h[4] +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: mov z16.h, z2.h[2] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strb w8, [sp, #13] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z7.h, z2.h[3] +; CHECK-NEXT: mov z2.h, z2.h[1] +; CHECK-NEXT: strb w10, [sp, #14] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x8] +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = srem <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: srem_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: sunpkhi z2.h, z1.b +; CHECK-NEXT: sunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z5.s, z2.h +; CHECK-NEXT: sunpkhi z6.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z4.h, z1.b +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: sunpkhi z6.s, z4.h +; CHECK-NEXT: sunpkhi z7.s, z3.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z5.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: srem_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: sunpkhi z5.h, z0.b +; CHECK-NEXT: sunpklo z7.h, z0.b +; CHECK-NEXT: sunpkhi z4.h, z2.b +; CHECK-NEXT: sunpklo z6.h, z2.b +; CHECK-NEXT: sunpkhi z16.s, z4.h +; CHECK-NEXT: sunpkhi z17.s, z5.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpkhi z18.s, z6.h +; CHECK-NEXT: sdivr z16.s, p1/m, z16.s, z17.s +; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.s, z7.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z16.h +; CHECK-NEXT: sdivr z6.s, p1/m, z6.s, z7.s +; CHECK-NEXT: sunpkhi z7.h, z3.b +; CHECK-NEXT: sunpkhi z16.h, z1.b +; CHECK-NEXT: sdiv z5.s, p1/m, z5.s, z18.s +; CHECK-NEXT: sunpkhi z17.s, z7.h +; CHECK-NEXT: sunpkhi z18.s, z16.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sunpklo z16.s, z16.h +; CHECK-NEXT: sdivr z17.s, p1/m, z17.s, z18.s +; CHECK-NEXT: sdivr z7.s, p1/m, z7.s, z16.s +; CHECK-NEXT: sunpklo z16.h, z3.b +; CHECK-NEXT: sunpklo z18.h, z1.b +; CHECK-NEXT: sunpkhi z19.s, z16.h +; CHECK-NEXT: sunpkhi z20.s, z18.h +; CHECK-NEXT: sunpklo z16.s, z16.h +; CHECK-NEXT: sunpklo z18.s, z18.h +; CHECK-NEXT: sdivr z19.s, p1/m, 
z19.s, z20.s +; CHECK-NEXT: sdivr z16.s, p1/m, z16.s, z18.s +; CHECK-NEXT: uzp1 z7.h, z7.h, z17.h +; CHECK-NEXT: uzp1 z16.h, z16.h, z19.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z5.h +; CHECK-NEXT: uzp1 z6.b, z16.b, z7.b +; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b +; CHECK-NEXT: mls z1.b, p0/m, z6.b, z3.b +; CHECK-NEXT: mls z0.b, p0/m, z4.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = srem <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: srem_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z2.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = srem <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: srem_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z4.s, z1.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sunpklo z5.s, z0.h +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: srem_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpkhi z16.s, z1.h +; CHECK-NEXT: sunpkhi z4.s, z2.h +; CHECK-NEXT: sunpkhi z7.s, z3.h +; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s +; CHECK-NEXT: sunpklo z5.s, z3.h +; CHECK-NEXT: sdivr z7.s, p1/m, z7.s, z16.s +; CHECK-NEXT: sunpklo z16.s, z1.h +; CHECK-NEXT: sunpklo z6.s, z2.h +; CHECK-NEXT: sdivr z5.s, p1/m, z5.s, z16.s +; CHECK-NEXT: sunpklo z16.s, z0.h +; CHECK-NEXT: uzp1 z5.h, z5.h, z7.h +; CHECK-NEXT: sdivr z6.s, p1/m, z6.s, z16.s +; CHECK-NEXT: mls z1.h, p0/m, z5.h, z3.h +; CHECK-NEXT: uzp1 z4.h, z6.h, z4.h +; CHECK-NEXT: mls z0.h, p0/m, z4.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = srem <16 x i16> %op1, %op2 + 
store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: srem_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = srem <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: srem_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @srem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: srem_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z3.s +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z2.s +; CHECK-NEXT: mls z1.s, p0/m, z4.s, z3.s +; CHECK-NEXT: mls z0.s, p0/m, z5.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = srem <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: srem_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = srem <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: srem_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @srem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: srem_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: sdiv z4.d, p0/m, z4.d, z3.d +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: sdiv z5.d, p0/m, z5.d, z2.d +; CHECK-NEXT: mls z1.d, p0/m, z4.d, z3.d +; CHECK-NEXT: mls z0.d, p0/m, z5.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b 
+ %res = srem <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; UREM +; + +define <4 x i8> @urem_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: urem_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI13_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = urem <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: urem_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: uunpklo z2.h, z1.b +; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z4.s, z2.h +; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.h, z2.h[7] +; CHECK-NEXT: mov z5.h, z2.h[5] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.h, z2.h[6] +; CHECK-NEXT: mov z6.h, z2.h[4] +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: mov z16.h, z2.h[2] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strb w8, [sp, #13] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z7.h, z2.h[3] +; CHECK-NEXT: mov z2.h, z2.h[1] +; CHECK-NEXT: strb w10, [sp, #14] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x8] +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = urem <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: urem_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: uunpkhi z2.h, z1.b +; CHECK-NEXT: uunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z5.s, z2.h +; CHECK-NEXT: uunpkhi z6.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z4.h, z1.b +; CHECK-NEXT: udivr z2.s, 
p0/m, z2.s, z3.s +; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: uunpkhi z6.s, z4.h +; CHECK-NEXT: uunpkhi z7.s, z3.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z5.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: urem_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: uunpkhi z5.h, z0.b +; CHECK-NEXT: uunpklo z7.h, z0.b +; CHECK-NEXT: uunpkhi z4.h, z2.b +; CHECK-NEXT: uunpklo z6.h, z2.b +; CHECK-NEXT: uunpkhi z16.s, z4.h +; CHECK-NEXT: uunpkhi z17.s, z5.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpkhi z18.s, z6.h +; CHECK-NEXT: udivr z16.s, p1/m, z16.s, z17.s +; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.s, z7.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z16.h +; CHECK-NEXT: udivr z6.s, p1/m, z6.s, z7.s +; CHECK-NEXT: uunpkhi z7.h, z3.b +; CHECK-NEXT: uunpkhi z16.h, z1.b +; CHECK-NEXT: udiv z5.s, p1/m, z5.s, z18.s +; CHECK-NEXT: uunpkhi z17.s, z7.h +; CHECK-NEXT: uunpkhi z18.s, z16.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: uunpklo z16.s, z16.h +; CHECK-NEXT: udivr z17.s, p1/m, z17.s, z18.s +; CHECK-NEXT: udivr z7.s, p1/m, z7.s, z16.s +; CHECK-NEXT: uunpklo z16.h, z3.b +; CHECK-NEXT: uunpklo z18.h, z1.b +; CHECK-NEXT: uunpkhi z19.s, z16.h +; CHECK-NEXT: uunpkhi z20.s, z18.h +; CHECK-NEXT: uunpklo z16.s, z16.h +; CHECK-NEXT: uunpklo z18.s, z18.h +; CHECK-NEXT: udivr z19.s, p1/m, z19.s, z20.s +; CHECK-NEXT: udivr z16.s, p1/m, z16.s, z18.s +; CHECK-NEXT: uzp1 z7.h, z7.h, z17.h +; CHECK-NEXT: uzp1 z16.h, z16.h, z19.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z5.h +; CHECK-NEXT: uzp1 z6.b, z16.b, z7.b +; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b +; CHECK-NEXT: mls z1.b, p0/m, z6.b, z3.b +; CHECK-NEXT: mls z0.b, p0/m, z4.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = urem <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: urem_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: 
strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = urem <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: urem_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z4.s, z1.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uunpklo z5.s, z0.h +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @urem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: urem_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpkhi z16.s, z1.h +; CHECK-NEXT: uunpkhi z4.s, z2.h +; CHECK-NEXT: uunpkhi z7.s, z3.h +; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s +; CHECK-NEXT: uunpklo z5.s, z3.h +; CHECK-NEXT: udivr z7.s, p1/m, z7.s, z16.s +; CHECK-NEXT: uunpklo z16.s, z1.h +; CHECK-NEXT: uunpklo z6.s, z2.h +; CHECK-NEXT: udivr z5.s, p1/m, z5.s, z16.s +; CHECK-NEXT: uunpklo z16.s, z0.h +; CHECK-NEXT: uzp1 z5.h, z5.h, z7.h +; CHECK-NEXT: udivr z6.s, p1/m, z6.s, z16.s +; CHECK-NEXT: mls z1.h, p0/m, z5.h, z3.h +; CHECK-NEXT: uzp1 z4.h, z6.h, z4.h +; CHECK-NEXT: mls z0.h, p0/m, z4.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = urem <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: urem_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = urem <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: urem_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @urem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: urem_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; 
CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z3.s +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z2.s +; CHECK-NEXT: mls z1.s, p0/m, z4.s, z3.s +; CHECK-NEXT: mls z0.s, p0/m, z5.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = urem <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: urem_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = urem <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: urem_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @urem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: urem_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: udiv z4.d, p0/m, z4.d, z3.d +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: udiv z5.d, p0/m, z5.d, z2.d +; CHECK-NEXT: mls z1.d, p0/m, z4.d, z3.d +; CHECK-NEXT: mls z0.d, p0/m, z5.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = urem <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll @@ -1,20 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -aarch64-sve-vector-bits-min=128 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=256 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=384 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=512 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=640 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=768 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=896 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=1024 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc 
-aarch64-sve-vector-bits-min=1152 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=1280 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=1408 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=1536 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=1664 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=1792 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=1920 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=2048 -force-sve-when-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -62,9 +47,18 @@ ret <4 x float> %load } -define <32 x float> @load_v32f32(<32 x float>* %a) #0 { - %load = load <32 x float>, <32 x float>* %a - ret <32 x float> %load +define <8 x float> @load_v8f32(<8 x float>* %a) #0 { +; CHECK-LABEL: load_v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 +; CHECK-NEXT: ret + %load = load <8 x float>, <8 x float>* %a + ret <8 x float> %load } define <2 x double> @load_v2f64(<2 x double>* %a) #0 { diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -aarch64-sve-vector-bits-min=128 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128_STREAMING -; RUN: llc -aarch64-sve-vector-bits-min=256 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256_STREAMING -; RUN: llc -aarch64-sve-vector-bits-min=512 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512_STREAMING +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -9,11 +7,47 @@ ; Masked Load ; -define <16 x i8> @masked_load_v16i8(<16 x i8>* %src, <16 x i1> %mask) #0 { -; CHECK-LABEL: masked_load_v16i8: +define <4 x i8> @masked_load_v4i8(<4 x i8>* %src, <4 x i1> %mask) #0 { +; CHECK-LABEL: masked_load_v4i8: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 ; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %load = call <4 x i8> @llvm.masked.load.v4i8(<4 x i8>* %src, i32 8, <4 x i1> %mask, <4 x i8> zeroinitializer) + ret <4 x i8> %load +} + +define <8 x i8> @masked_load_v8i8(<8 x i8>* %src, <8 x i1> %mask) #0 { +; CHECK-LABEL: masked_load_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, 
.LCPI1_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI1_0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x8] +; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %load = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %src, i32 8, <8 x i1> %mask, <8 x i8> zeroinitializer) + ret <8 x i8> %load +} + +define <16 x i8> @masked_load_v16i8(<16 x i8>* %src, <16 x i1> %mask) #0 { +; CHECK-LABEL: masked_load_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI2_0 ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x8] @@ -27,6 +61,93 @@ ret <16 x i8> %load } +define <32 x i8> @masked_load_v32i8(<32 x i8>* %src, <32 x i1> %mask) #0 { +; CHECK-LABEL: masked_load_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldr w8, [sp, #224] +; CHECK-NEXT: strb w7, [sp, #6] +; CHECK-NEXT: ldr w9, [sp, #216] +; CHECK-NEXT: strb w6, [sp, #5] +; CHECK-NEXT: ldr w10, [sp, #208] +; CHECK-NEXT: strb w5, [sp, #4] +; CHECK-NEXT: strb w8, [sp, #31] +; CHECK-NEXT: ldr w8, [sp, #200] +; CHECK-NEXT: strb w9, [sp, #30] +; CHECK-NEXT: ldr w9, [sp, #192] +; CHECK-NEXT: strb w10, [sp, #29] +; CHECK-NEXT: ldr w10, [sp, #184] +; CHECK-NEXT: strb w8, [sp, #28] +; CHECK-NEXT: ldr w8, [sp, #176] +; CHECK-NEXT: strb w9, [sp, #27] +; CHECK-NEXT: ldr w9, [sp, #168] +; CHECK-NEXT: strb w10, [sp, #26] +; CHECK-NEXT: ldr w10, [sp, #160] +; CHECK-NEXT: strb w8, [sp, #25] +; CHECK-NEXT: ldr w8, [sp, #152] +; CHECK-NEXT: strb w9, [sp, #24] +; CHECK-NEXT: ldr w9, [sp, #144] +; CHECK-NEXT: strb w10, [sp, #23] +; CHECK-NEXT: ldr w10, [sp, #136] +; CHECK-NEXT: strb w8, [sp, #22] +; CHECK-NEXT: ldr w8, [sp, #128] +; CHECK-NEXT: strb w9, [sp, #21] +; CHECK-NEXT: ldr w9, [sp, #120] +; CHECK-NEXT: strb w10, [sp, #20] +; CHECK-NEXT: ldr w10, [sp, #112] +; CHECK-NEXT: strb w8, [sp, #19] +; CHECK-NEXT: ldr w8, [sp, #104] +; CHECK-NEXT: strb w9, [sp, #18] +; CHECK-NEXT: ldr w9, [sp, #96] +; CHECK-NEXT: strb w10, [sp, #17] +; CHECK-NEXT: ldr w10, [sp, #88] +; CHECK-NEXT: strb w8, [sp, #16] +; CHECK-NEXT: ldr w8, [sp, #80] +; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: ldr w9, [sp, #72] +; CHECK-NEXT: strb w10, [sp, #14] +; CHECK-NEXT: ldr w10, [sp, #64] +; CHECK-NEXT: strb w8, [sp, #13] +; CHECK-NEXT: ldr w8, [sp, #56] +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: ldr w9, [sp, #48] +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: ldr w10, [sp, #40] +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: ldr w8, [sp, #32] +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: adrp x9, .LCPI3_0 +; CHECK-NEXT: add x9, x9, :lo12:.LCPI3_0 +; CHECK-NEXT: strb w10, [sp, #8] +; CHECK-NEXT: strb w8, [sp, #7] +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: strb w4, [sp, #3] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: strb w3, [sp, #2] +; CHECK-NEXT: strb w2, [sp, #1] +; CHECK-NEXT: strb w1, [sp] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x9] +; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x8] +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: lsl z1.b, p0/m, z1.b, z0.b +; CHECK-NEXT: lsl z2.b, p0/m, z2.b, z0.b +; CHECK-NEXT: asr z1.b, p0/m, z1.b, z0.b +; CHECK-NEXT: asrr z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: cmpne 
p1.b, p0/z, z1.b, #0 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0, x8] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret + %load = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %src, i32 8, <32 x i1> %mask, <32 x i8> zeroinitializer) + ret <32 x i8> %load +} + define <2 x half> @masked_load_v2f16(<2 x half>* %src, <2 x i1> %mask) #0 { ; CHECK-LABEL: masked_load_v2f16: ; CHECK: // %bb.0: @@ -39,8 +160,8 @@ ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: add x8, x8, :lo12:.LCPI1_0 +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI4_0 ; CHECK-NEXT: strh w9, [sp, #10] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] ; CHECK-NEXT: add x8, sp, #8 @@ -56,11 +177,93 @@ ret <2 x half> %load } -define <4 x float> @masked_load_v4f32(<4 x float>* %src, <4 x i1> %mask) vscale_range(1,16) #0 { +define <4 x half> @masked_load_v4f16(<4 x half>* %src, <4 x i1> %mask) #0 { +; CHECK-LABEL: masked_load_v4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI5_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %load = call <4 x half> @llvm.masked.load.v4f16(<4 x half>* %src, i32 8, <4 x i1> %mask, <4 x half> zeroinitializer) + ret <4 x half> %load +} + +define <8 x half> @masked_load_v8f16(<8 x half>* %src, <8 x i1> %mask) #0 { +; CHECK-LABEL: masked_load_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI6_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %load = call <8 x half> @llvm.masked.load.v8f16(<8 x half>* %src, i32 8, <8 x i1> %mask, <8 x half> zeroinitializer) + ret <8 x half> %load +} + +define <16 x half> @masked_load_v16f16(<16 x half>* %src, <16 x i1> %mask) #0 { +; CHECK-LABEL: masked_load_v16f16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI7_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: uunpklo z2.h, z0.b +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8] +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: lsl z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: asr z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: cmpne p1.h, p0/z, z2.h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 +; CHECK-NEXT: ret + %load = call <16 x half> @llvm.masked.load.v16f16(<16 x 
half>* %src, i32 8, <16 x i1> %mask, <16 x half> zeroinitializer) + ret <16 x half> %load +} + +define <2 x float> @masked_load_v2f32(<2 x float>* %src, <2 x i1> %mask) #0 { +; CHECK-LABEL: masked_load_v2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI8_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %load = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %src, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer) + ret <2 x float> %load +} + +define <4 x float> @masked_load_v4f32(<4 x float>* %src, <4 x i1> %mask) #0 { ; CHECK-LABEL: masked_load_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: add x8, x8, :lo12:.LCPI2_0 +; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI9_0 ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z0.s, z0.h @@ -75,11 +278,68 @@ ret <4 x float> %load } +define <8 x float> @masked_load_v8f32(<8 x float>* %src, <8 x i1> %mask) #0 { +; CHECK-LABEL: masked_load_v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z1.b, z0.b[3] +; CHECK-NEXT: mov z3.b, z0.b[1] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: mov z2.b, z0.b[2] +; CHECK-NEXT: mov z4.b, z0.b[7] +; CHECK-NEXT: strh w9, [sp] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z6.b, z0.b[5] +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: strh w10, [sp, #6] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w9, [sp, #2] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z5.b, z0.b[6] +; CHECK-NEXT: mov z0.b, z0.b[4] +; CHECK-NEXT: strh w11, [sp, #4] +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: strh w10, [sp, #14] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: strh w9, [sp, #10] +; CHECK-NEXT: adrp x9, .LCPI10_0 +; CHECK-NEXT: add x9, x9, :lo12:.LCPI10_0 +; CHECK-NEXT: strh w11, [sp, #12] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: strh w10, [sp, #8] +; CHECK-NEXT: ptrue p1.h, vl4 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x9] +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x8] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: cmpne p2.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p2/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p1/z, [x8] +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: asrr z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %load = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %src, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer) + ret <8 x float> %load +} + define <2 x double> @masked_load_v2f64(<2 x double>* %src, <2 x i1> %mask) #0 { ; CHECK-LABEL: masked_load_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: add x8, x8, :lo12:.LCPI3_0 +; CHECK-NEXT: adrp x8, .LCPI11_0 +; 
CHECK-NEXT: add x8, x8, :lo12:.LCPI11_0 ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z0.d, z0.s @@ -95,62 +355,47 @@ } define <4 x double> @masked_load_v4f64(<4 x double>* %src, <4 x i1> %mask) #0 { -; VBITS_GE_128_STREAMING-LABEL: masked_load_v4f64: -; VBITS_GE_128_STREAMING: // %bb.0: -; VBITS_GE_128_STREAMING-NEXT: adrp x8, .LCPI4_0 -; VBITS_GE_128_STREAMING-NEXT: add x8, x8, :lo12:.LCPI4_0 -; VBITS_GE_128_STREAMING-NEXT: ptrue p0.d, vl2 -; VBITS_GE_128_STREAMING-NEXT: // kill: def $d0 killed $d0 def $z0 -; VBITS_GE_128_STREAMING-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_128_STREAMING-NEXT: ld1d { z1.d }, p0/z, [x8] -; VBITS_GE_128_STREAMING-NEXT: uunpklo z2.d, z0.s -; VBITS_GE_128_STREAMING-NEXT: ext z0.b, z0.b, z0.b, #8 -; VBITS_GE_128_STREAMING-NEXT: mov x8, #2 -; VBITS_GE_128_STREAMING-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_128_STREAMING-NEXT: lsl z2.d, p0/m, z2.d, z1.d -; VBITS_GE_128_STREAMING-NEXT: lsl z0.d, p0/m, z0.d, z1.d -; VBITS_GE_128_STREAMING-NEXT: asr z2.d, p0/m, z2.d, z1.d -; VBITS_GE_128_STREAMING-NEXT: asr z0.d, p0/m, z0.d, z1.d -; VBITS_GE_128_STREAMING-NEXT: cmpne p1.d, p0/z, z2.d, #0 -; VBITS_GE_128_STREAMING-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_128_STREAMING-NEXT: ld1d { z0.d }, p1/z, [x0] -; VBITS_GE_128_STREAMING-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_128_STREAMING-NEXT: // kill: def $q0 killed $q0 killed $z0 -; VBITS_GE_128_STREAMING-NEXT: // kill: def $q1 killed $q1 killed $z1 -; VBITS_GE_128_STREAMING-NEXT: ret -; -; VBITS_GE_256_STREAMING-LABEL: masked_load_v4f64: -; VBITS_GE_256_STREAMING: // %bb.0: -; VBITS_GE_256_STREAMING-NEXT: // kill: def $d0 killed $d0 def $z0 -; VBITS_GE_256_STREAMING-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256_STREAMING-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_256_STREAMING-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256_STREAMING-NEXT: lsl z0.d, p0/m, z0.d, #63 -; VBITS_GE_256_STREAMING-NEXT: asr z0.d, p0/m, z0.d, #63 -; VBITS_GE_256_STREAMING-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; VBITS_GE_256_STREAMING-NEXT: ld1d { z0.d }, p1/z, [x0] -; VBITS_GE_256_STREAMING-NEXT: st1d { z0.d }, p0, [x8] -; VBITS_GE_256_STREAMING-NEXT: ret -; -; VBITS_GE_512_STREAMING-LABEL: masked_load_v4f64: -; VBITS_GE_512_STREAMING: // %bb.0: -; VBITS_GE_512_STREAMING-NEXT: // kill: def $d0 killed $d0 def $z0 -; VBITS_GE_512_STREAMING-NEXT: ptrue p0.d, vl4 -; VBITS_GE_512_STREAMING-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_512_STREAMING-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_512_STREAMING-NEXT: lsl z0.d, p0/m, z0.d, #63 -; VBITS_GE_512_STREAMING-NEXT: asr z0.d, p0/m, z0.d, #63 -; VBITS_GE_512_STREAMING-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; VBITS_GE_512_STREAMING-NEXT: ld1d { z0.d }, p1/z, [x0] -; VBITS_GE_512_STREAMING-NEXT: st1d { z0.d }, p0, [x8] -; VBITS_GE_512_STREAMING-NEXT: ret +; CHECK-LABEL: masked_load_v4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI12_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8] +; CHECK-NEXT: uunpklo z2.d, z0.s +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: asr z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d 
}, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 +; CHECK-NEXT: ret %load = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %src, i32 8, <4 x i1> %mask, <4 x double> zeroinitializer) ret <4 x double> %load } +declare <4 x i8> @llvm.masked.load.v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>) declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) +declare <32 x i8> @llvm.masked.load.v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>) + declare <2 x half> @llvm.masked.load.v2f16(<2 x half>*, i32, <2 x i1>, <2 x half>) +declare <4 x half> @llvm.masked.load.v4f16(<4 x half>*, i32, <4 x i1>, <4 x half>) +declare <8 x half> @llvm.masked.load.v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>) +declare <16 x half> @llvm.masked.load.v16f16(<16 x half>*, i32, <16 x i1>, <16 x half>) + +declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) +declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) + declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -aarch64-sve-vector-bits-min=128 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=256 -force-sve-when-streaming-compatible < %s | FileCheck %s -; RUN: llc -aarch64-sve-vector-bits-min=512 -force-sve-when-streaming-compatible < %s | FileCheck %s +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -9,16 +7,56 @@ ; Masked Store ; -define void @masked_store_v16i8(<16 x i8>* %dst, <16 x i1> %mask) #0 { -; CHECK-LABEL: masked_store_v16i8: +define void @masked_store_v4i8(<4 x i8>* %dst, <4 x i1> %mask) #0 { +; CHECK-LABEL: masked_store_v4i8: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 ; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_1 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: st1b { z2.h }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.masked.store.v4i8(<4 x i8> zeroinitializer, <4 x i8>* %dst, i32 8, <4 x i1> %mask) + ret void +} + +define void @masked_store_v8i8(<8 x i8>* %dst, <8 x i1> %mask) #0 { +; CHECK-LABEL: masked_store_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI1_0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI1_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI1_1 +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x8] +; 
CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: st1b { z2.b }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.masked.store.v8i8(<8 x i8> zeroinitializer, <8 x i8>* %dst, i32 8, <8 x i1> %mask) + ret void +} + +define void @masked_store_v16i8(<16 x i8>* %dst, <16 x i1> %mask) #0 { +; CHECK-LABEL: masked_store_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI2_0 ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x8] -; CHECK-NEXT: adrp x8, .LCPI0_1 -; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_1 +; CHECK-NEXT: adrp x8, .LCPI2_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI2_1 ; CHECK-NEXT: ld1b { z2.b }, p0/z, [x8] ; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b @@ -29,6 +67,94 @@ ret void } +define void @masked_store_v32i8(<32 x i8>* %dst, <32 x i1> %mask) #0 { +; CHECK-LABEL: masked_store_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldr w8, [sp, #96] +; CHECK-NEXT: strb w7, [sp, #6] +; CHECK-NEXT: ldr w9, [sp, #88] +; CHECK-NEXT: strb w6, [sp, #5] +; CHECK-NEXT: ldr w10, [sp, #80] +; CHECK-NEXT: strb w5, [sp, #4] +; CHECK-NEXT: strb w8, [sp, #15] +; CHECK-NEXT: ldr w8, [sp, #72] +; CHECK-NEXT: strb w9, [sp, #14] +; CHECK-NEXT: ldr w9, [sp, #64] +; CHECK-NEXT: strb w10, [sp, #13] +; CHECK-NEXT: ldr w10, [sp, #56] +; CHECK-NEXT: strb w8, [sp, #12] +; CHECK-NEXT: ldr w8, [sp, #48] +; CHECK-NEXT: strb w9, [sp, #11] +; CHECK-NEXT: ldr w9, [sp, #40] +; CHECK-NEXT: strb w10, [sp, #10] +; CHECK-NEXT: ldr w10, [sp, #32] +; CHECK-NEXT: strb w8, [sp, #9] +; CHECK-NEXT: ldr w8, [sp, #224] +; CHECK-NEXT: strb w9, [sp, #8] +; CHECK-NEXT: ldr w9, [sp, #216] +; CHECK-NEXT: strb w10, [sp, #7] +; CHECK-NEXT: ldr w10, [sp, #208] +; CHECK-NEXT: strb w8, [sp, #31] +; CHECK-NEXT: ldr w8, [sp, #200] +; CHECK-NEXT: strb w9, [sp, #30] +; CHECK-NEXT: ldr w9, [sp, #192] +; CHECK-NEXT: strb w10, [sp, #29] +; CHECK-NEXT: ldr w10, [sp, #184] +; CHECK-NEXT: strb w8, [sp, #28] +; CHECK-NEXT: ldr w8, [sp, #176] +; CHECK-NEXT: strb w9, [sp, #27] +; CHECK-NEXT: ldr w9, [sp, #168] +; CHECK-NEXT: strb w10, [sp, #26] +; CHECK-NEXT: ldr w10, [sp, #160] +; CHECK-NEXT: strb w8, [sp, #25] +; CHECK-NEXT: ldr w8, [sp, #152] +; CHECK-NEXT: strb w9, [sp, #24] +; CHECK-NEXT: ldr w9, [sp, #144] +; CHECK-NEXT: strb w10, [sp, #23] +; CHECK-NEXT: ldr w10, [sp, #136] +; CHECK-NEXT: strb w8, [sp, #22] +; CHECK-NEXT: ldr w8, [sp, #128] +; CHECK-NEXT: strb w9, [sp, #21] +; CHECK-NEXT: ldr w9, [sp, #120] +; CHECK-NEXT: strb w10, [sp, #20] +; CHECK-NEXT: ldr w10, [sp, #112] +; CHECK-NEXT: strb w8, [sp, #19] +; CHECK-NEXT: ldr w8, [sp, #104] +; CHECK-NEXT: strb w4, [sp, #3] +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: strb w3, [sp, #2] +; CHECK-NEXT: strb w8, [sp, #16] +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI3_0 +; CHECK-NEXT: strb w2, [sp, #1] +; CHECK-NEXT: strb w1, [sp] +; CHECK-NEXT: strb w9, [sp, #18] +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: strb w10, [sp, #17] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI3_1 +; CHECK-NEXT: lsl z1.b, p0/m, z1.b, z0.b +; CHECK-NEXT: asr z1.b, p0/m, z1.b, z0.b +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x8] +; CHECK-NEXT: 
lsl z2.b, p0/m, z2.b, z0.b +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: asrr z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: cmpne p1.b, p0/z, z1.b, #0 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: st1b { z3.b }, p0, [x0, x8] +; CHECK-NEXT: st1b { z3.b }, p1, [x0] +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret + call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, <32 x i8>* %dst, i32 8, <32 x i1> %mask) + ret void +} + define void @masked_store_v2f16(<2 x half>* %dst, <2 x i1> %mask) #0 { ; CHECK-LABEL: masked_store_v2f16: ; CHECK: // %bb.0: @@ -41,14 +167,14 @@ ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: add x8, x8, :lo12:.LCPI1_0 +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI4_0 ; CHECK-NEXT: strh w9, [sp, #10] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] ; CHECK-NEXT: add x8, sp, #8 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8] -; CHECK-NEXT: adrp x8, .LCPI1_1 -; CHECK-NEXT: add x8, x8, :lo12:.LCPI1_1 +; CHECK-NEXT: adrp x8, .LCPI4_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI4_1 ; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] ; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z0.h ; CHECK-NEXT: asrr z0.h, p0/m, z0.h, z1.h @@ -60,17 +186,87 @@ ret void } +define void @masked_store_v4f16(<4 x half>* %dst, <4 x i1> %mask) #0 { +; CHECK-LABEL: masked_store_v4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI5_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI5_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI5_1 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: st1h { z2.h }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.masked.store.v4f16(<4 x half> zeroinitializer, <4 x half>* %dst, i32 8, <4 x i1> %mask) + ret void +} + +define void @masked_store_v8f16(<8 x half>* %dst, <8 x i1> %mask) #0 { +; CHECK-LABEL: masked_store_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI6_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI6_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI6_1 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: st1h { z2.h }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, <8 x half>* %dst, i32 8, <8 x i1> %mask) + ret void +} + +define void @masked_store_v16f16(<16 x half>* %dst, <16 x i1> %mask) #0 { +; CHECK-LABEL: masked_store_v16f16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI7_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI7_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI7_1 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x8] +; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: asr z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: asr 
z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: cmpne p1.h, p0/z, z1.h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: st1h { z3.h }, p1, [x0, x8, lsl #1] +; CHECK-NEXT: st1h { z3.h }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, <16 x half>* %dst, i32 8, <16 x i1> %mask) + ret void +} + define void @masked_store_v4f32(<4 x float>* %dst, <4 x i1> %mask) #0 { ; CHECK-LABEL: masked_store_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: add x8, x8, :lo12:.LCPI2_0 +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI8_0 ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] -; CHECK-NEXT: adrp x8, .LCPI2_1 -; CHECK-NEXT: add x8, x8, :lo12:.LCPI2_1 +; CHECK-NEXT: adrp x8, .LCPI8_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI8_1 ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8] ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s @@ -81,23 +277,215 @@ ret void } -; this test case caused a crash; I don't know why, so I've commented it out for now and will investigate later. -; define void @masked_store_v32f32(<32 x float>* %dst, <32 x i1> %mask) #0 { -; call void @llvm.masked.store.v32f32(<32 x float> zeroinitializer, <32 x float>* %dst, i32 8, <32 x i1> %mask) -; ret void -; } +define void @masked_store_v8f32(<8 x float>* %dst, <8 x i1> %mask) #0 { +; CHECK-LABEL: masked_store_v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z1.b, z0.b[7] +; CHECK-NEXT: mov z2.b, z0.b[6] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z1.b, z0.b[5] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.b, z0.b[4] +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI9_0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w11, [sp, #10] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ptrue p1.h, vl4 +; CHECK-NEXT: mov z4.b, z0.b[3] +; CHECK-NEXT: strh w9, [sp, #8] +; CHECK-NEXT: mov x9, #4 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: mov z5.b, z0.b[2] +; CHECK-NEXT: mov z6.b, z0.b[1] +; CHECK-NEXT: ld1h { z2.h }, p1/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI9_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI9_1 +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: cmpne p2.s, p0/z, z0.s, #0 +; CHECK-NEXT: st1w { z3.s }, p2, [x0, x9, lsl #2] +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: strh w8, [sp] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: strh w10, [sp, #6] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: strh w9, [sp, #4] +; CHECK-NEXT: strh w8, [sp, #2] +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x10] +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: st1w { z3.s }, p0, [x0] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, <8 x float>* %dst, i32 8, <8 x i1> %mask) + ret void +} + +define void @masked_store_v32f32(<32 x float>* %dst, <32 x i1> %mask) #0 { +; CHECK-LABEL: masked_store_v32f32: +; CHECK: // %bb.0: 
+; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: ldr w8, [sp, #256] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldr w9, [sp, #248] +; CHECK-NEXT: ptrue p1.h, vl4 +; CHECK-NEXT: ldr w10, [sp, #240] +; CHECK-NEXT: ldr w11, [sp, #232] +; CHECK-NEXT: strh w8, [sp, #62] +; CHECK-NEXT: adrp x8, .LCPI10_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI10_0 +; CHECK-NEXT: strh w9, [sp, #60] +; CHECK-NEXT: ldr w9, [sp, #224] +; CHECK-NEXT: strh w10, [sp, #58] +; CHECK-NEXT: adrp x10, .LCPI10_1 +; CHECK-NEXT: add x10, x10, :lo12:.LCPI10_1 +; CHECK-NEXT: strh w11, [sp, #56] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: add x8, sp, #56 +; CHECK-NEXT: ldr w11, [sp, #216] +; CHECK-NEXT: ldr w12, [sp, #208] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x8] +; CHECK-NEXT: ldr w8, [sp, #200] +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x10] +; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: mov x10, #28 +; CHECK-NEXT: asr z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; CHECK-NEXT: st1w { z1.s }, p2, [x0, x10, lsl #2] +; CHECK-NEXT: strh w9, [sp, #54] +; CHECK-NEXT: add x9, sp, #48 +; CHECK-NEXT: strh w11, [sp, #52] +; CHECK-NEXT: strh w12, [sp, #50] +; CHECK-NEXT: mov x10, #24 +; CHECK-NEXT: strh w8, [sp, #48] +; CHECK-NEXT: ldr w8, [sp, #168] +; CHECK-NEXT: ld1h { z2.h }, p1/z, [x9] +; CHECK-NEXT: ldr w9, [sp, #192] +; CHECK-NEXT: ldr w11, [sp, #184] +; CHECK-NEXT: ldr w12, [sp, #176] +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: asr z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; CHECK-NEXT: st1w { z1.s }, p2, [x0, x10, lsl #2] +; CHECK-NEXT: strh w9, [sp, #46] +; CHECK-NEXT: add x9, sp, #40 +; CHECK-NEXT: strh w11, [sp, #44] +; CHECK-NEXT: strh w12, [sp, #42] +; CHECK-NEXT: mov x10, #20 +; CHECK-NEXT: strh w8, [sp, #40] +; CHECK-NEXT: ldr w8, [sp, #136] +; CHECK-NEXT: ld1h { z2.h }, p1/z, [x9] +; CHECK-NEXT: ldr w9, [sp, #160] +; CHECK-NEXT: ldr w11, [sp, #152] +; CHECK-NEXT: ldr w12, [sp, #144] +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: asr z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; CHECK-NEXT: st1w { z1.s }, p2, [x0, x10, lsl #2] +; CHECK-NEXT: strh w9, [sp, #38] +; CHECK-NEXT: add x9, sp, #32 +; CHECK-NEXT: strh w11, [sp, #36] +; CHECK-NEXT: strh w12, [sp, #34] +; CHECK-NEXT: mov x10, #16 +; CHECK-NEXT: strh w8, [sp, #32] +; CHECK-NEXT: ldr w8, [sp, #104] +; CHECK-NEXT: ld1h { z2.h }, p1/z, [x9] +; CHECK-NEXT: ldr w9, [sp, #128] +; CHECK-NEXT: ldr w11, [sp, #120] +; CHECK-NEXT: ldr w12, [sp, #112] +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: asr z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; CHECK-NEXT: st1w { z1.s }, p2, [x0, x10, lsl #2] +; CHECK-NEXT: strh w9, [sp, #30] +; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: strh w11, [sp, #28] +; CHECK-NEXT: strh w12, [sp, #26] +; CHECK-NEXT: mov x10, #12 +; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: ldr w8, [sp, #72] +; CHECK-NEXT: ld1h { z2.h }, p1/z, [x9] +; CHECK-NEXT: ldr w9, [sp, #96] +; CHECK-NEXT: ldr w11, [sp, #88] +; CHECK-NEXT: ldr w12, [sp, #80] +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: asr z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; CHECK-NEXT: st1w { z1.s }, p2, [x0, x10, lsl #2] +; CHECK-NEXT: strh w9, [sp, #22] +; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: strh w11, [sp, #20] +; 
CHECK-NEXT: strh w12, [sp, #18] +; CHECK-NEXT: strh w8, [sp, #16] +; CHECK-NEXT: ldr w8, [sp, #64] +; CHECK-NEXT: ld1h { z2.h }, p1/z, [x9] +; CHECK-NEXT: mov x9, #8 +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: asr z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; CHECK-NEXT: st1w { z1.s }, p2, [x0, x9, lsl #2] +; CHECK-NEXT: strh w8, [sp, #14] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w7, [sp, #12] +; CHECK-NEXT: strh w6, [sp, #10] +; CHECK-NEXT: strh w5, [sp, #8] +; CHECK-NEXT: ld1h { z2.h }, p1/z, [x8] +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: asr z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; CHECK-NEXT: st1w { z1.s }, p2, [x0, x8, lsl #2] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: strh w4, [sp, #6] +; CHECK-NEXT: strh w3, [sp, #4] +; CHECK-NEXT: strh w2, [sp, #2] +; CHECK-NEXT: strh w1, [sp] +; CHECK-NEXT: ld1h { z2.h }, p1/z, [x8] +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: st1w { z1.s }, p0, [x0] +; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: ret + call void @llvm.masked.store.v32f32(<32 x float> zeroinitializer, <32 x float>* %dst, i32 8, <32 x i1> %mask) + ret void +} define void @masked_store_v2f64(<2 x double>* %dst, <2 x i1> %mask) #0 { ; CHECK-LABEL: masked_store_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: add x8, x8, :lo12:.LCPI3_0 +; CHECK-NEXT: adrp x8, .LCPI11_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI11_0 ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8] -; CHECK-NEXT: adrp x8, .LCPI3_1 -; CHECK-NEXT: add x8, x8, :lo12:.LCPI3_1 +; CHECK-NEXT: adrp x8, .LCPI11_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI11_1 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x8] ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d @@ -108,10 +496,48 @@ ret void } +define void @masked_store_v4f64(<4 x double>* %dst, <4 x i1> %mask) #0 { +; CHECK-LABEL: masked_store_v4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI12_0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI12_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI12_1 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x8] +; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z2.d +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: asr z1.d, p0/m, z1.d, z2.d +; CHECK-NEXT: asr z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: st1d { z3.d }, p1, [x0, x8, lsl #3] +; CHECK-NEXT: st1d { z3.d }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.masked.store.v4f64(<4 x double> zeroinitializer, <4 x double>* %dst, i32 8, <4 x i1> %mask) + ret void +} + +declare void @llvm.masked.store.v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>) +declare void @llvm.masked.store.v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>) declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) +declare void @llvm.masked.store.v32i8(<32 x i8>, <32 x i8>*, i32, 
<32 x i1>) declare void @llvm.masked.store.v2f16(<2 x half>, <2 x half>*, i32, <2 x i1>) +declare void @llvm.masked.store.v4f16(<4 x half>, <4 x half>*, i32, <4 x i1>) +declare void @llvm.masked.store.v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>) +declare void @llvm.masked.store.v16f16(<16 x half>, <16 x half>*, i32, <16 x i1>) declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) +declare void @llvm.masked.store.v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>) declare void @llvm.masked.store.v32f32(<32 x float>, <32 x float>*, i32, <32 x i1>) declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) +declare void @llvm.masked.store.v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>) attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll @@ -1,20 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -aarch64-sve-vector-bits-min=128 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128 -; RUN: llc -aarch64-sve-vector-bits-min=256 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 -; RUN: llc -aarch64-sve-vector-bits-min=512 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=640 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=768 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=896 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=1024 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1152 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1280 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1408 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1536 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1664 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1792 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1920 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=2048 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048 +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -62,6 +47,19 @@ ret void } +define void 
@store_v8f32(<8 x float>* %a) #0 { +; CHECK-LABEL: store_v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI4_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: stp q0, q0, [x0] +; CHECK-NEXT: ret + store <8 x float> zeroinitializer, <8 x float>* %a + ret void +} + define void @store_v2f64(<2 x double>* %a) #0 { ; CHECK-LABEL: store_v2f64: ; CHECK: // %bb.0: @@ -73,9 +71,3 @@ attributes #0 = { "target-features"="+sve" } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; VBITS_GE_1024: {{.*}} -; VBITS_GE_128: {{.*}} -; VBITS_GE_2048: {{.*}} -; VBITS_GE_256: {{.*}} -; VBITS_GE_512: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll @@ -1,8 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -aarch64-sve-vector-bits-min=128 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128 -; RUN: llc -aarch64-sve-vector-bits-min=256 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 -; RUN: llc -aarch64-sve-vector-bits-min=512 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=2048 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -138,7 +135,3 @@ attributes #0 = { "target-features"="+sve" } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; VBITS_GE_128: {{.*}} -; VBITS_GE_256: {{.*}} -; VBITS_GE_512: {{.*}}
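
For reference, the srem/urem tests in this patch pin down a straightforward expansion: streaming-compatible SVE provides predicated vector divides but no vector remainder instruction, so remainders are computed as a - (a / b) * b, i.e. an sdiv/udiv followed by an mls, exactly as the CHECK lines show. Below is a minimal standalone sketch of that pattern; the file layout and function name are illustrative, not part of the patch.

; RUN: llc -force-streaming-compatible-sve < %s
target triple = "aarch64-unknown-linux-gnu"

; Per the srem_v4i32 test above, this is expected to compile to:
;   ptrue p0.s, vl4 ; sdiv z2.s, p0/m, z2.s, z1.s ; mls z0.s, p0/m, z2.s, z1.s
define <4 x i32> @srem_example(<4 x i32> %a, <4 x i32> %b) #0 {
  %r = srem <4 x i32> %a, %b
  ret <4 x i32> %r
}

attributes #0 = { "target-features"="+sve" }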
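
Likewise, the masked load/store tests all exercise the same predicate-materialisation idiom: the fixed-length <N x i1> mask arrives widened in a vector register, so it is sign-extended in-register with an lsl/asr pair and then compared against zero (cmpne) to build the governing predicate for the ld1/st1. A minimal sketch of the pattern being tested, under the same assumptions as the example above:

; RUN: llc -force-streaming-compatible-sve < %s
target triple = "aarch64-unknown-linux-gnu"

; Per the masked_load_v4f32 test above, the <4 x i1> mask is expected to be
; lowered via lsl/asr (sign-extend bit 0 of each lane) and cmpne #0, with
; the resulting predicate feeding a ld1w from %src.
define <4 x float> @masked_load_example(<4 x float>* %src, <4 x i1> %mask) #0 {
  %r = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
  ret <4 x float> %r
}

declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)

attributes #0 = { "target-features"="+sve" }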