diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1614,6 +1614,18 @@ void AArch64TargetLowering::addTypeForStreamingSVE(MVT VT) { setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::ANY_EXTEND, VT, Custom); + setOperationAction(ISD::ZERO_EXTEND, VT, Custom); + setOperationAction(ISD::SIGN_EXTEND, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::SUB, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); + setOperationAction(ISD::MULHS, VT, Custom); + setOperationAction(ISD::MULHU, VT, Custom); + setOperationAction(ISD::ABS, VT, Custom); + setOperationAction(ISD::AND, VT, Custom); + setOperationAction(ISD::XOR, VT, Custom); } void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { @@ -3518,7 +3530,8 @@ } SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { - if (useSVEForFixedLengthVectorVT(Op.getValueType())) + if (useSVEForFixedLengthVectorVT(Op.getValueType(), + Subtarget->forceStreamingCompatibleSVE())) return LowerToScalableOp(Op, DAG); SDValue Sel = Op.getOperand(0); @@ -4430,7 +4443,8 @@ EVT VT = Op.getValueType(); // If SVE is available then i64 vector multiplications can also be made legal. 
- bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64; + bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64 || + Subtarget->forceStreamingCompatibleSVE(); if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON)) return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED); @@ -10898,7 +10912,8 @@ ShuffleVectorSDNode *SVN = cast(Op.getNode()); - if (useSVEForFixedLengthVectorVT(VT)) + if (useSVEForFixedLengthVectorVT(VT, + Subtarget->forceStreamingCompatibleSVE())) return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG); // Convert shuffles that are directly supported on NEON to target-specific @@ -11468,7 +11483,8 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { - if (useSVEForFixedLengthVectorVT(Op.getValueType())) + if (useSVEForFixedLengthVectorVT(Op.getValueType(), + Subtarget->forceStreamingCompatibleSVE())) return LowerToScalableOp(Op, DAG); // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) @@ -11919,7 +11935,8 @@ SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { - if (useSVEForFixedLengthVectorVT(Op.getValueType())) + if (useSVEForFixedLengthVectorVT(Op.getValueType(), + Subtarget->forceStreamingCompatibleSVE())) return LowerFixedLengthConcatVectorsToSVE(Op, DAG); assert(Op.getValueType().isScalableVector() && @@ -12025,8 +12042,8 @@ return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType()); } - // try overriding NEON if possible. - if (useSVEForFixedLengthVectorVT(VT)) + if (useSVEForFixedLengthVectorVT(VT, + Subtarget->forceStreamingCompatibleSVE())) return LowerFixedLengthExtractVectorElt(Op, DAG); // Check for non-constant or out of range lane. @@ -12085,13 +12102,16 @@ if (Idx == 0 && InVT.getSizeInBits() <= 128) return Op; - // If this is extracting the upper 64-bits of a 128-bit vector, we match - // that directly. 
- if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 && - InVT.getSizeInBits() == 128) - return Op; + if (!Subtarget->forceStreamingCompatibleSVE()) { + // If this is extracting the upper 64-bits of a 128-bit vector, we match + // that directly. + if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 && + InVT.getSizeInBits() == 128) + return Op; + } - if (useSVEForFixedLengthVectorVT(InVT)) { + if (useSVEForFixedLengthVectorVT(InVT, + Subtarget->forceStreamingCompatibleSVE())) { SDLoc DL(Op); EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); @@ -12389,7 +12409,9 @@ switch (Op.getOpcode()) { case ISD::SHL: - if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) + if (VT.isScalableVector() || + useSVEForFixedLengthVectorVT(VT, + Subtarget->forceStreamingCompatibleSVE())) return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED); if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) @@ -12401,7 +12423,9 @@ Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: - if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) { + if (VT.isScalableVector() || + useSVEForFixedLengthVectorVT( + VT, Subtarget->forceStreamingCompatibleSVE())) { unsigned Opc = Op.getOpcode() == ISD::SRA ? 
AArch64ISD::SRA_PRED : AArch64ISD::SRL_PRED; return LowerToPredicatedOp(Op, DAG, Opc); @@ -15514,7 +15538,8 @@ } static SDValue performANDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *const Subtarget) { SelectionDAG &DAG = DCI.DAG; SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -15525,10 +15550,14 @@ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - if (VT.isScalableVector()) return performSVEAndCombine(N, DCI); + // In streaming-compatible SVE mode the NEON-only combines below must not + // run; fixed-length vectors are lowered via SVE, so reuse the SVE combine. + if (VT.isFixedLengthVector() && Subtarget->forceStreamingCompatibleSVE()) + return performSVEAndCombine(N, DCI); + // The combining code below works only for NEON vectors. In particular, it // does not work for SVE when dealing with vectors wider than 128 bits. if (!VT.is64BitVector() && !VT.is128BitVector()) @@ -20247,7 +20281,7 @@ case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::AND: - return performANDCombine(N, DCI); + return performANDCombine(N, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return performIntrinsicCombine(N, DCI, Subtarget); case ISD::ANY_EXTEND: @@ -22054,7 +22088,8 @@ SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - assert(useSVEForFixedLengthVectorVT(VT) && + assert(useSVEForFixedLengthVectorVT( + VT, Subtarget->forceStreamingCompatibleSVE()) && "Only expected to lower fixed length vector operation!"); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); @@ -22070,7 +22105,8 @@ } // "cast" fixed length vector to a scalable vector. 
- assert(useSVEForFixedLengthVectorVT(V.getValueType()) && + assert(useSVEForFixedLengthVectorVT( + V.getValueType(), Subtarget->forceStreamingCompatibleSVE()) && "Only fixed length vectors are supported!"); Ops.push_back(convertToScalableVector(DAG, ContainerVT, V)); } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll @@ -0,0 +1,1310 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; ADD +; +define <4 x i8> @add_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: add_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: add_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: add_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: add z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = add <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: add_v32i8: +; CHECK: 
// %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: add z1.b, z1.b, z3.b +; CHECK-NEXT: add z0.b, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = add <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @add_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: add_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w9, #48 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z4.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z5.b }, p0/z, [x1, x9] +; CHECK-NEXT: ld1b { z6.b }, p0/z, [x1, x10] +; CHECK-NEXT: ld1b { z7.b }, p0/z, [x1] +; CHECK-NEXT: add z0.b, z0.b, z4.b +; CHECK-NEXT: add z1.b, z1.b, z5.b +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: add z0.b, z3.b, z7.b +; CHECK-NEXT: add z1.b, z2.b, z6.b +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = add <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <2 x i16> @add_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: add_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: add_v4i16: +; CHECK: // %bb.0: 
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: add_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = add <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: add_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: add z1.h, z1.h, z3.h +; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = add <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @add_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: add_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: mov x10, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z4.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z5.h }, p0/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1h { z6.h }, p0/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1h { z7.h }, p0/z, [x1] +; CHECK-NEXT: add z0.h, z0.h, z4.h +; CHECK-NEXT: add z1.h, z1.h, 
z5.h +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: add z0.h, z3.h, z7.h +; CHECK-NEXT: add z1.h, z2.h, z6.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = add <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: add_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: add_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = add <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: add_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: add z1.s, z1.s, z3.s +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = add <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @add_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: add_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x9, #12 +; CHECK-NEXT: mov x10, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, 
p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; CHECK-NEXT: ld1w { z7.s }, p0/z, [x1] +; CHECK-NEXT: add z0.s, z0.s, z4.s +; CHECK-NEXT: add z1.s, z1.s, z5.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: add z0.s, z3.s, z7.s +; CHECK-NEXT: add z1.s, z2.s, z6.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = add <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: add_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: add_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = add <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: add_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: add z1.d, z1.d, z3.d +; CHECK-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] 
+; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = add <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @add_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: add_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x9, #6 +; CHECK-NEXT: mov x10, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1] +; CHECK-NEXT: add z0.d, z0.d, z4.d +; CHECK-NEXT: add z1.d, z1.d, z5.d +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: add z0.d, z3.d, z7.d +; CHECK-NEXT: add z1.d, z2.d, z6.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = add <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +; +; MUL +; + +define <4 x i8> @mul_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: mul_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: mul_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul 
<8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: mul_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = mul <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @mul_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: mul_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z3.b +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = mul <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @mul_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: mul_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w9, #48 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z4.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z5.b }, p0/z, [x1, x9] +; CHECK-NEXT: ld1b { z6.b }, p0/z, [x1, x10] +; CHECK-NEXT: ld1b { z7.b }, p0/z, [x1] +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z4.b +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z5.b +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z7.b +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z6.b +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + 
%op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = mul <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: mul_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: mul_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: mul_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = mul <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @mul_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: mul_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b 
+ %res = mul <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @mul_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: mul_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: mov x10, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z4.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z5.h }, p0/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1h { z6.h }, p0/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1h { z7.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z4.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z5.h +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z7.h +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z6.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = mul <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: mul_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: mul_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = mul 
<4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @mul_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: mul_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = mul <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @mul_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: mul_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x9, #12 +; CHECK-NEXT: mov x10, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; CHECK-NEXT: ld1w { z7.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z4.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z5.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z7.s +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z6.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = mul <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: mul_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue 
p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: mul_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = mul <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @mul_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: mul_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = mul <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @mul_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: mul_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x9, #6 +; CHECK-NEXT: mov x10, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z4.d +; CHECK-NEXT: mul z1.d, p0/m, 
z1.d, z5.d +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z7.d +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z6.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = mul <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +; +; SUB +; + +define <4 x i8> @sub_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: sub_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: sub_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: sub_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sub z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sub <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @sub_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: sub_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: sub z1.b, z1.b, z3.b +; CHECK-NEXT: sub z0.b, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, 
[x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = sub <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @sub_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: sub_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w9, #48 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z4.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z5.b }, p0/z, [x1, x9] +; CHECK-NEXT: ld1b { z6.b }, p0/z, [x1, x10] +; CHECK-NEXT: ld1b { z7.b }, p0/z, [x1] +; CHECK-NEXT: sub z0.b, z0.b, z4.b +; CHECK-NEXT: sub z1.b, z1.b, z5.b +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: sub z0.b, z3.b, z7.b +; CHECK-NEXT: sub z1.b, z2.b, z6.b +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = sub <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <2 x i16> @sub_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: sub_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: sub_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; 
CHECK-LABEL: sub_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sub <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @sub_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: sub_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: sub z1.h, z1.h, z3.h +; CHECK-NEXT: sub z0.h, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = sub <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @sub_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: sub_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: mov x10, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z4.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z5.h }, p0/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1h { z6.h }, p0/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1h { z7.h }, p0/z, [x1] +; CHECK-NEXT: sub z0.h, z0.h, z4.h +; CHECK-NEXT: sub z1.h, z1.h, z5.h +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: sub z0.h, z3.h, z7.h +; CHECK-NEXT: sub z1.h, z2.h, z6.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = sub <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + 
+define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: sub_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: sub_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sub <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @sub_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: sub_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: sub z1.s, z1.s, z3.s +; CHECK-NEXT: sub z0.s, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = sub <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @sub_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: sub_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x9, #12 +; CHECK-NEXT: mov x10, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; 
CHECK-NEXT: ld1w { z7.s }, p0/z, [x1] +; CHECK-NEXT: sub z0.s, z0.s, z4.s +; CHECK-NEXT: sub z1.s, z1.s, z5.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: sub z0.s, z3.s, z7.s +; CHECK-NEXT: sub z1.s, z2.s, z6.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = sub <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: sub_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: sub_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sub z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sub <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @sub_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: sub_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: sub z1.d, z1.d, z3.d +; CHECK-NEXT: sub z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = sub <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @sub_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: sub_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: 
mov x9, #6 +; CHECK-NEXT: mov x10, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1] +; CHECK-NEXT: sub z0.d, z0.d, z4.d +; CHECK-NEXT: sub z1.d, z1.d, z5.d +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: sub z0.d, z3.d, z7.d +; CHECK-NEXT: sub z1.d, z2.d, z6.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = sub <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + + +; +; ABS +; + +define <4 x i8> @abs_v4i8(<4 x i8> %op1) #0 { +; CHECK-LABEL: abs_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI54_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI54_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %op1, i1 false) + ret <4 x i8> %res +} + +define <8 x i8> @abs_v8i8(<8 x i8> %op1) #0 { +; CHECK-LABEL: abs_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: abs z0.b, p0/m, z0.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false) + ret <8 x i8> %res +} + +define <16 x i8> @abs_v16i8(<16 x i8> %op1) #0 { +; CHECK-LABEL: abs_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: 
ptrue p0.b, vl16 +; CHECK-NEXT: abs z0.b, p0/m, z0.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false) + ret <16 x i8> %res +} + +define void @abs_v32i8(<32 x i8>* %a) #0 { +; CHECK-LABEL: abs_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: abs z1.b, p0/m, z1.b +; CHECK-NEXT: abs z0.b, p0/m, z0.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false) + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @abs_v64i8(<64 x i8>* %a) #0 { +; CHECK-LABEL: abs_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w9, #48 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x0] +; CHECK-NEXT: abs z0.b, p0/m, z0.b +; CHECK-NEXT: abs z1.b, p0/m, z1.b +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: abs z0.b, p0/m, z3.b +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: abs z1.b, p0/m, z2.b +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %res = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %op1, i1 false) + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <2 x i16> @abs_v2i16(<2 x i16> %op1) #0 { +; CHECK-LABEL: abs_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI59_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI59_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; 
CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %op1, i1 false) + ret <2 x i16> %res +} + +define <4 x i16> @abs_v4i16(<4 x i16> %op1) #0 { +; CHECK-LABEL: abs_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false) + ret <4 x i16> %res +} + +define <8 x i16> @abs_v8i16(<8 x i16> %op1) #0 { +; CHECK-LABEL: abs_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false) + ret <8 x i16> %res +} + +define void @abs_v16i16(<16 x i16>* %a) #0 { +; CHECK-LABEL: abs_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: abs z1.h, p0/m, z1.h +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false) + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @abs_v32i16(<32 x i16>* %a) #0 { +; CHECK-LABEL: abs_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mov x10, #8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0] +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: abs z1.h, p0/m, z1.h +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, 
z3 +; CHECK-NEXT: abs z0.h, p0/m, z3.h +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: abs z1.h, p0/m, z2.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %op1, i1 false) + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @abs_v2i32(<2 x i32> %op1) #0 { +; CHECK-LABEL: abs_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false) + ret <2 x i32> %res +} + +define <4 x i32> @abs_v4i32(<4 x i32> %op1) #0 { +; CHECK-LABEL: abs_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false) + ret <4 x i32> %res +} + +define void @abs_v8i32(<8 x i32>* %a) #0 { +; CHECK-LABEL: abs_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: abs z1.s, p0/m, z1.s +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @abs_v16i32(<16 x i32>* %a) #0 { +; CHECK-LABEL: abs_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x9, #12 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: mov x10, #4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s 
}, p0/z, [x0] +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: abs z1.s, p0/m, z1.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: abs z0.s, p0/m, z3.s +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: abs z1.s, p0/m, z2.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %res = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %op1, i1 false) + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @abs_v1i64(<1 x i64> %op1) #0 { +; CHECK-LABEL: abs_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false) + ret <1 x i64> %res +} + +define <2 x i64> @abs_v2i64(<2 x i64> %op1) #0 { +; CHECK-LABEL: abs_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false) + ret <2 x i64> %res +} + +define void @abs_v4i64(<4 x i64>* %a) #0 { +; CHECK-LABEL: abs_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: abs z1.d, p0/m, z1.d +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @abs_v8i64(<8 x i64>* %a) #0 { +; CHECK-LABEL: abs_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x9, #6 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov x10, #2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, 
x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: abs z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: abs z0.d, p0/m, z3.d +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: abs z1.d, p0/m, z2.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %res = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %op1, i1 false) + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +declare <4 x i8> @llvm.abs.v4i8(<4 x i8>, i1) +declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1) +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) +declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) +declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1) +declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1) +declare <2 x i16> @llvm.abs.v2i16(<2 x i16>, i1) +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) +declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1) +declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1) +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) +declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1) +declare <1 x i64> @llvm.abs.v1i64(<1 x i64>, i1) +declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1) +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) +declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1) + + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -0,0 +1,1229 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target 
triple = "aarch64-unknown-linux-gnu" + +; +; SDIV +; + +define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: sdiv_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: asr z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = sdiv <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: sdiv_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, 
z0.h, z2.h +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.h, z0.h[7] +; CHECK-NEXT: mov z3.h, z0.h[5] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: mov z4.h, z0.h[4] +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z6.h, z0.h[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: strb w8, [sp, #13] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: strb w10, [sp, #14] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = sdiv <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: sdiv_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: sunpkhi z2.h, z1.b +; CHECK-NEXT: sunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sunpkhi z4.s, z2.h +; CHECK-NEXT: sunpkhi z5.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sunpkhi z3.s, z1.h +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; 
CHECK-NEXT: ret + %res = sdiv <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: sdiv_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z5.h, z0.b +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpkhi z4.h, z2.b +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: sunpkhi z6.s, z4.h +; CHECK-NEXT: sunpkhi z7.s, z5.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpkhi z16.s, z2.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z6.h +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: sunpkhi z2.h, z3.b +; CHECK-NEXT: sunpkhi z6.h, z1.b +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z16.s +; CHECK-NEXT: sunpkhi z7.s, z2.h +; CHECK-NEXT: sunpkhi z16.s, z6.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z6.s +; CHECK-NEXT: sunpkhi z6.s, z3.h +; CHECK-NEXT: sunpkhi z16.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z16.s +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z7.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z6.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h +; CHECK-NEXT: uzp1 z1.b, z1.b, z2.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z4.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = sdiv <32 x i8> %op1, 
%op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @sdiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: sdiv_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w9, #48 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: ptrue p1.b, vl16 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, x8] +; CHECK-NEXT: ld1b { z3.b }, p1/z, [x0, x9] +; CHECK-NEXT: ld1b { z4.b }, p1/z, [x0, x10] +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z5.b }, p1/z, [x1, x10] +; CHECK-NEXT: ld1b { z7.b }, p1/z, [x1, x9] +; CHECK-NEXT: ld1b { z6.b }, p1/z, [x1, x8] +; CHECK-NEXT: sunpkhi z16.h, z4.b +; CHECK-NEXT: sunpklo z4.h, z4.b +; CHECK-NEXT: sunpkhi z1.h, z5.b +; CHECK-NEXT: sunpkhi z18.s, z16.h +; CHECK-NEXT: sunpkhi z17.s, z1.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z16.s, z16.h +; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z16.s +; CHECK-NEXT: sunpklo z5.h, z5.b +; CHECK-NEXT: uzp1 z1.h, z1.h, z17.h +; CHECK-NEXT: sunpkhi z17.s, z5.h +; CHECK-NEXT: sunpkhi z18.s, z4.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.h, z7.b +; CHECK-NEXT: sunpkhi z18.h, z3.b +; CHECK-NEXT: sunpkhi z19.s, z5.h +; CHECK-NEXT: sunpkhi z20.s, z18.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpklo z18.s, z18.h +; CHECK-NEXT: sunpklo z7.h, z7.b +; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: sdivr z19.s, p0/m, z19.s, z20.s +; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z18.s +; CHECK-NEXT: sunpkhi z18.s, z7.h +; CHECK-NEXT: sunpkhi z20.s, z3.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z20.s +; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z7.s +; CHECK-NEXT: uzp1 z5.h, z5.h, z19.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z18.h +; CHECK-NEXT: ld1b { z16.b }, p1/z, [x1] +; CHECK-NEXT: uzp1 
z3.b, z3.b, z5.b +; CHECK-NEXT: sunpkhi z5.h, z6.b +; CHECK-NEXT: sunpkhi z7.h, z2.b +; CHECK-NEXT: uzp1 z4.h, z4.h, z17.h +; CHECK-NEXT: sunpkhi z17.s, z5.h +; CHECK-NEXT: sunpkhi z18.s, z7.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sunpklo z6.h, z6.b +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z7.s +; CHECK-NEXT: sunpkhi z7.s, z6.h +; CHECK-NEXT: sunpkhi z18.s, z2.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z18.s +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z6.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z7.h +; CHECK-NEXT: sunpkhi z6.h, z16.b +; CHECK-NEXT: sunpkhi z7.h, z0.b +; CHECK-NEXT: uzp1 z5.h, z5.h, z17.h +; CHECK-NEXT: sunpkhi z17.s, z6.h +; CHECK-NEXT: sunpkhi z18.s, z7.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: uzp1 z2.b, z2.b, z5.b +; CHECK-NEXT: uzp1 z5.h, z6.h, z17.h +; CHECK-NEXT: sunpklo z6.h, z16.b +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpkhi z7.s, z6.h +; CHECK-NEXT: sunpkhi z16.s, z0.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z6.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z7.h +; CHECK-NEXT: uzp1 z1.b, z4.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z5.b +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = sdiv <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: sdiv_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI5_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: 
def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8] +; CHECK-NEXT: lsl z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: asr z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: sdiv_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = sdiv <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: sdiv_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, 
z0.h, z2.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: sdiv_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpkhi z4.s, z2.h +; CHECK-NEXT: sunpkhi z6.s, z3.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.s, z1.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z5.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z4.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = sdiv <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @sdiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: sdiv_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: mov x10, #8 +; CHECK-NEXT: ptrue p1.h, vl8 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.h }, p1/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z4.h }, p1/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1h { z5.h }, p1/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1h { z6.h }, p1/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z17.h }, p1/z, [x1] +; CHECK-NEXT: sunpkhi z18.s, z1.h +; CHECK-NEXT: sunpklo z1.s, 
z1.h +; CHECK-NEXT: sunpkhi z16.s, z2.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpkhi z7.s, z4.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: sunpkhi z16.s, z5.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z4.s +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z5.s +; CHECK-NEXT: sunpkhi z4.s, z6.h +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z6.s +; CHECK-NEXT: sunpkhi z5.s, z17.h +; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s +; CHECK-NEXT: sunpkhi z6.s, z3.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z4.h +; CHECK-NEXT: movprfx z4, z6 +; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpklo z5.s, z17.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z16.h +; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: uzp1 z0.h, z3.h, z4.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z7.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = sdiv <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: sdiv_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: sdiv_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; 
CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @sdiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: sdiv_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = sdiv <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @sdiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: sdiv_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x9, #12 +; CHECK-NEXT: mov x10, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; CHECK-NEXT: ld1w { z7.s }, p0/z, [x1] +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z4.s +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z5.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z7.s +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z6.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = sdiv <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: 
sdiv_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: sdiv_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @sdiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: sdiv_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: sdiv z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = sdiv <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @sdiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: sdiv_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x9, #6 +; CHECK-NEXT: mov x10, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl 
#3] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1] +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z4.d +; CHECK-NEXT: sdiv z1.d, p0/m, z1.d, z5.d +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z7.d +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: sdiv z1.d, p0/m, z1.d, z6.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = sdiv <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +; +; UDIV +; + +define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: udiv_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI18_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = udiv <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: udiv_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.h, z0.h[7] +; CHECK-NEXT: mov z3.h, z0.h[5] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: mov z4.h, z0.h[4] +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z6.h, z0.h[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: strb w8, [sp, #13] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: strb w10, [sp, #14] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = udiv <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: udiv_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: uunpkhi z2.h, z1.b +; CHECK-NEXT: uunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpkhi z4.s, z2.h +; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, 
z3.s +; CHECK-NEXT: uunpkhi z3.s, z1.h +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: udiv_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z5.h, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z4.h, z2.b +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: uunpkhi z6.s, z4.h +; CHECK-NEXT: uunpkhi z7.s, z5.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpkhi z16.s, z2.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z6.h +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uunpkhi z2.h, z3.b +; CHECK-NEXT: uunpkhi z6.h, z1.b +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z16.s +; CHECK-NEXT: uunpkhi z7.s, z2.h +; CHECK-NEXT: uunpkhi z16.s, z6.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z6.s +; CHECK-NEXT: uunpkhi z6.s, z3.h +; CHECK-NEXT: uunpkhi z16.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: udivr z6.s, p0/m, 
z6.s, z16.s +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z7.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z6.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h +; CHECK-NEXT: uzp1 z1.b, z1.b, z2.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z4.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = udiv <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @udiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: udiv_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w9, #48 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: ptrue p1.b, vl16 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, x8] +; CHECK-NEXT: ld1b { z3.b }, p1/z, [x0, x9] +; CHECK-NEXT: ld1b { z4.b }, p1/z, [x0, x10] +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z5.b }, p1/z, [x1, x10] +; CHECK-NEXT: ld1b { z7.b }, p1/z, [x1, x9] +; CHECK-NEXT: ld1b { z6.b }, p1/z, [x1, x8] +; CHECK-NEXT: uunpkhi z16.h, z4.b +; CHECK-NEXT: uunpklo z4.h, z4.b +; CHECK-NEXT: uunpkhi z1.h, z5.b +; CHECK-NEXT: uunpkhi z18.s, z16.h +; CHECK-NEXT: uunpkhi z17.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z16.s, z16.h +; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z16.s +; CHECK-NEXT: uunpklo z5.h, z5.b +; CHECK-NEXT: uzp1 z1.h, z1.h, z17.h +; CHECK-NEXT: uunpkhi z17.s, z5.h +; CHECK-NEXT: uunpkhi z18.s, z4.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.h, z7.b +; CHECK-NEXT: uunpkhi z18.h, z3.b +; CHECK-NEXT: uunpkhi z19.s, z5.h +; CHECK-NEXT: uunpkhi z20.s, z18.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpklo z18.s, z18.h +; CHECK-NEXT: uunpklo z7.h, z7.b +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: udivr z19.s, p0/m, z19.s, z20.s +; CHECK-NEXT: 
udivr z5.s, p0/m, z5.s, z18.s +; CHECK-NEXT: uunpkhi z18.s, z7.h +; CHECK-NEXT: uunpkhi z20.s, z3.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z20.s +; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z7.s +; CHECK-NEXT: uzp1 z5.h, z5.h, z19.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z18.h +; CHECK-NEXT: ld1b { z16.b }, p1/z, [x1] +; CHECK-NEXT: uzp1 z3.b, z3.b, z5.b +; CHECK-NEXT: uunpkhi z5.h, z6.b +; CHECK-NEXT: uunpkhi z7.h, z2.b +; CHECK-NEXT: uzp1 z4.h, z4.h, z17.h +; CHECK-NEXT: uunpkhi z17.s, z5.h +; CHECK-NEXT: uunpkhi z18.s, z7.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: uunpklo z6.h, z6.b +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z7.s +; CHECK-NEXT: uunpkhi z7.s, z6.h +; CHECK-NEXT: uunpkhi z18.s, z2.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z18.s +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z6.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z7.h +; CHECK-NEXT: uunpkhi z6.h, z16.b +; CHECK-NEXT: uunpkhi z7.h, z0.b +; CHECK-NEXT: uzp1 z5.h, z5.h, z17.h +; CHECK-NEXT: uunpkhi z17.s, z6.h +; CHECK-NEXT: uunpkhi z18.s, z7.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: uzp1 z2.b, z2.b, z5.b +; CHECK-NEXT: uzp1 z5.h, z6.h, z17.h +; CHECK-NEXT: uunpklo z6.h, z16.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z7.s, z6.h +; CHECK-NEXT: uunpkhi z16.s, z0.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z6.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z7.h +; CHECK-NEXT: uzp1 z1.b, z4.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z5.b +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <64 
x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = udiv <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: udiv_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI23_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI23_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8] +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: udiv_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = udiv <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: udiv_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def 
$q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: udiv_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpkhi z4.s, z2.h +; CHECK-NEXT: uunpkhi z6.s, z3.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.s, z1.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z5.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z4.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = udiv <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @udiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: udiv_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: mov x10, #8 +; CHECK-NEXT: ptrue p1.h, vl8 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.h }, p1/z, [x0, x10, lsl #1] +; 
CHECK-NEXT: ld1h { z3.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z4.h }, p1/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1h { z5.h }, p1/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1h { z6.h }, p1/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z17.h }, p1/z, [x1] +; CHECK-NEXT: uunpkhi z18.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpkhi z16.s, z2.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpkhi z7.s, z4.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: uunpkhi z16.s, z5.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z4.s +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z5.s +; CHECK-NEXT: uunpkhi z4.s, z6.h +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z6.s +; CHECK-NEXT: uunpkhi z5.s, z17.h +; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s +; CHECK-NEXT: uunpkhi z6.s, z3.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z4.h +; CHECK-NEXT: movprfx z4, z6 +; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpklo z5.s, z17.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z16.h +; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: uzp1 z0.h, z3.h, z4.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z7.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = udiv <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: udiv_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <2 x i32> %op1, %op2 + ret <2 x i32> 
%res +} + +define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: udiv_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: udiv_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = udiv <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @udiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: udiv_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x9, #12 +; CHECK-NEXT: mov x10, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; CHECK-NEXT: ld1w { z7.s }, p0/z, [x1] +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z4.s +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z5.s +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z7.s +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z6.s +; 
CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = udiv <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: udiv_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: udiv_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @udiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: udiv_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: udiv z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = udiv <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @udiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: udiv_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x9, #6 +; CHECK-NEXT: mov x10, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] 
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1] +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z4.d +; CHECK-NEXT: udiv z1.d, p0/m, z1.d, z5.d +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z7.d +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: udiv z1.d, p0/m, z1.d, z6.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = udiv <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +define void @udiv_constantsplat_v8i32(<8 x i32>* %a) #0 { +; CHECK-LABEL: udiv_constantsplat_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: adrp x8, .LCPI36_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI36_0 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI36_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI36_1 +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI36_2 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI36_2 +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x8] +; CHECK-NEXT: movprfx z5, z1 +; CHECK-NEXT: umulh z5.s, p0/m, z5.s, z2.s +; CHECK-NEXT: umulh z2.s, p0/m, z2.s, z0.s +; CHECK-NEXT: sub z1.s, z1.s, z5.s +; CHECK-NEXT: sub z0.s, z0.s, z2.s +; CHECK-NEXT: lsr z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z3.s +; CHECK-NEXT: add z1.s, z1.s, z5.s +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: lsr z1.s, p0/m, z1.s, z4.s +; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z4.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %res = udiv <8 
x i32> %op1, + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll @@ -0,0 +1,546 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; AND +; + +define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: and_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: and_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @and_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: and_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = and <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 
x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: and_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: and_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @and_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: and_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = and <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: and_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: and_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 
def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @and_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: and_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = and <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: and_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: and_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @and_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: and_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: and z0.d, 
z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = and <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; OR +; + +define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: or_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: or_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @or_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: or_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = or <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: or_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <4 x i16> %op1, %op2 + ret <4 x i16> %res +} 
+ +define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: or_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @or_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: or_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = or <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: or_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: or_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @or_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: or_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, 
lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = or <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: or_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: or_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @or_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: or_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = or <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; XOR +; + +define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: xor_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def 
$z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: xor_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @xor_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: xor_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: eor z1.d, z1.d, z3.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = xor <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: xor_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: xor_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + 
+define void @xor_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: xor_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: eor z1.d, z1.d, z3.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = xor <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: xor_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: xor_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @xor_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: xor_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: eor z1.d, z1.d, z3.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = xor <8 x i32> %op1, 
%op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: xor_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: xor_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @xor_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: xor_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: eor z1.d, z1.d, z3.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = xor <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -0,0 +1,924 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +; This test only tests the legal types for a given 
vector width, as mulh nodes +; do not get generated for non-legal types. + +target triple = "aarch64-unknown-linux-gnu" + +; +; SMULH +; + +define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: smulh_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_1 +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: asr z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z3.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %insert = insertelement <4 x i16> undef, i16 4, i64 0 + %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer + %1 = sext <4 x i8> %op1 to <4 x i16> + %2 = sext <4 x i8> %op2 to <4 x i16> + %mul = mul <4 x i16> %1, %2 + %shr = lshr <4 x i16> %mul, + %res = trunc <4 x i16> %shr to <4 x i8> + ret <4 x i8> %res +} + +define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: smulh_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %insert = insertelement <8 x i16> undef, i16 8, i64 0 + %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer + %1 = sext <8 x i8> %op1 to <8 x i16> + %2 = sext <8 x i8> %op2 to <8 x i16> + %mul = mul <8 x i16> %1, %2 + %shr = lshr <8 x i16> %mul, + %res = trunc <8 x i16> %shr to <8 x i8> + ret <8 x i8> %res +} + 
+define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: smulh_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <16 x i8> %op1 to <16 x i16> + %2 = sext <16 x i8> %op2 to <16 x i16> + %mul = mul <16 x i16> %1, %2 + %shr = lshr <16 x i16> %mul, + %res = trunc <16 x i16> %shr to <16 x i8> + ret <16 x i8> %res +} + +define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: smulh_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ptrue p1.h, vl8 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI3_0 +; CHECK-NEXT: sunpklo z5.h, z2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: sunpklo z7.h, z3.b +; CHECK-NEXT: ld1h { z16.h }, p1/z, [x8] +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: mul z5.h, p1/m, z5.h, z7.h +; CHECK-NEXT: mul z2.h, p1/m, z2.h, z3.h +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: lsr z3.h, p1/m, z3.h, z16.h +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: sunpklo z4.h, z0.b +; CHECK-NEXT: sunpklo z6.h, z1.b +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: lsr z2.h, p1/m, z2.h, z16.h +; CHECK-NEXT: mov z5.h, z3.h[7] +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: mul z4.h, p1/m, z4.h, z6.h +; CHECK-NEXT: mov z6.h, z3.h[6] +; CHECK-NEXT: mov z7.h, z3.h[5] +; CHECK-NEXT: mov z17.h, z3.h[4] +; CHECK-NEXT: mov z18.h, z3.h[3] +; CHECK-NEXT: mov z19.h, z3.h[2] +; CHECK-NEXT: mov z20.h, 
z3.h[1] +; CHECK-NEXT: mov z3.h, z2.h[7] +; CHECK-NEXT: mov z21.h, z2.h[6] +; CHECK-NEXT: mov z22.h, z2.h[5] +; CHECK-NEXT: mov z23.h, z2.h[4] +; CHECK-NEXT: mov z24.h, z2.h[3] +; CHECK-NEXT: mov z25.h, z2.h[2] +; CHECK-NEXT: mov z26.h, z2.h[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mul z0.h, p1/m, z0.h, z1.h +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w8, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: strb w9, [sp, #8] +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: strb w10, [sp, #7] +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: lsr z0.h, p1/m, z0.h, z16.h +; CHECK-NEXT: strb w8, [sp, #6] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: strb w9, [sp, #5] +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: strb w10, [sp, #4] +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: strb w8, [sp, #3] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: strb w9, [sp, #2] +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: strb w10, [sp, #1] +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: strb w8, [sp, #15] +; CHECK-NEXT: fmov w8, s23 +; CHECK-NEXT: strb w9, [sp, #14] +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: strb w10, [sp, #13] +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: strb w8, [sp, #12] +; CHECK-NEXT: fmov w8, s26 +; CHECK-NEXT: movprfx z1, z4 +; CHECK-NEXT: lsr z1.h, p1/m, z1.h, z16.h +; CHECK-NEXT: strb w9, [sp, #11] +; CHECK-NEXT: mov z2.h, z1.h[7] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strb w10, [sp, #10] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: strb w8, [sp, #9] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.h, z1.h[6] +; CHECK-NEXT: mov z4.h, z1.h[5] +; CHECK-NEXT: mov z5.h, z1.h[4] +; CHECK-NEXT: strb w9, [sp, #16] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w10, [sp, #24] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w8, [sp, #23] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: mov z6.h, z1.h[3] +; CHECK-NEXT: mov z7.h, z1.h[2] +; CHECK-NEXT: mov z16.h, z1.h[1] +; CHECK-NEXT: strb w9, [sp, #22] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strb w10, [sp, #21] 
+; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strb w8, [sp, #20] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z1.h, z0.h[7] +; CHECK-NEXT: mov z17.h, z0.h[6] +; CHECK-NEXT: mov z18.h, z0.h[5] +; CHECK-NEXT: strb w9, [sp, #19] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strb w10, [sp, #18] +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: strb w8, [sp, #17] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z19.h, z0.h[4] +; CHECK-NEXT: mov z20.h, z0.h[3] +; CHECK-NEXT: mov z21.h, z0.h[2] +; CHECK-NEXT: strb w9, [sp, #31] +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: strb w10, [sp, #30] +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: strb w8, [sp, #29] +; CHECK-NEXT: fmov w8, s21 +; CHECK-NEXT: mov z22.h, z0.h[1] +; CHECK-NEXT: strb w9, [sp, #28] +; CHECK-NEXT: fmov w9, s22 +; CHECK-NEXT: strb w10, [sp, #27] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: strb w8, [sp, #26] +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: strb w9, [sp, #25] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x10] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x8] +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %1 = sext <32 x i8> %op1 to <32 x i16> + %2 = sext <32 x i8> %op2 to <32 x i16> + %mul = mul <32 x i16> %1, %2 + %shr = lshr <32 x i16> %mul, + %res = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: smulh_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI4_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8] +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: lsl z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: asr z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; 
CHECK-NEXT: lsr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <2 x i16> %op1 to <2 x i32> + %2 = sext <2 x i16> %op2 to <2 x i32> + %mul = mul <2 x i32> %1, %2 + %shr = lshr <2 x i32> %mul, + %res = trunc <2 x i32> %shr to <2 x i16> + ret <2 x i16> %res +} + +define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: smulh_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <4 x i16> %op1 to <4 x i32> + %2 = sext <4 x i16> %op2 to <4 x i32> + %mul = mul <4 x i32> %1, %2 + %shr = lshr <4 x i32> %mul, + %res = trunc <4 x i32> %shr to <4 x i16> + ret <4 x i16> %res +} + +define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: smulh_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <8 x i16> %op1 to <8 x i32> + %2 = sext <8 x i16> %op2 to <8 x i32> + %mul = mul <8 x i32> %1, %2 + %shr = lshr <8 x i32> %mul, + %res = trunc <8 x i32> %shr to <8 x i16> + ret <8 x i16> %res +} + +define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: smulh_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: smulh z4.h, p0/m, z4.h, z3.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext 
z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: smulh z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: smulh z3.h, p0/m, z3.h, z2.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z1.h +; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h +; CHECK-NEXT: stp q4, q3, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %1 = sext <16 x i16> %op1 to <16 x i32> + %2 = sext <16 x i16> %op2 to <16 x i32> + %mul = mul <16 x i32> %1, %2 + %shr = lshr <16 x i32> %mul, + %res = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: smulh_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <2 x i32> %op1 to <2 x i64> + %2 = sext <2 x i32> %op2 to <2 x i64> + %mul = mul <2 x i64> %1, %2 + %shr = lshr <2 x i64> %mul, + %res = trunc <2 x i64> %shr to <2 x i32> + ret <2 x i32> %res +} + +define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: smulh_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <4 x i32> %op1 to <4 x i64> + %2 = sext <4 x i32> %op2 to <4 x i64> + %mul = mul <4 x i64> %1, %2 + %shr = lshr <4 x i64> %mul, + %res = trunc <4 x i64> %shr to <4 x i32> + ret <4 x i32> %res +} + +define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: smulh_v8i32: +; CHECK: // %bb.0: +; 
CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: smulh z4.s, p0/m, z4.s, z3.s +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: smulh z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: smulh z3.s, p0/m, z3.s, z2.s +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: splice z4.s, p0, z4.s, z1.s +; CHECK-NEXT: splice z3.s, p0, z3.s, z0.s +; CHECK-NEXT: stp q4, q3, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %1 = sext <8 x i32> %op1 to <8 x i64> + %2 = sext <8 x i32> %op2 to <8 x i64> + %mul = mul <8 x i64> %1, %2 + %shr = lshr <8 x i64> %mul, + %res = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: smulh_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %insert = insertelement <1 x i128> undef, i128 64, i128 0 + %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer + %1 = sext <1 x i64> %op1 to <1 x i128> + %2 = sext <1 x i64> %op2 to <1 x i128> + %mul = mul <1 x i128> %1, %2 + %shr = lshr <1 x i128> %mul, %splat + %res = trunc <1 x i128> %shr to <1 x i64> + ret <1 x i64> %res +} + +define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: smulh_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: 
def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <2 x i64> %op1 to <2 x i128> + %2 = sext <2 x i64> %op2 to <2 x i128> + %mul = mul <2 x i128> %1, %2 + %shr = lshr <2 x i128> %mul, + %res = trunc <2 x i128> %shr to <2 x i64> + ret <2 x i64> %res +} + +define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: smulh_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: mov z2.d, z3.d[1] +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: fmov x13, d2 +; CHECK-NEXT: fmov x14, d4 +; CHECK-NEXT: smulh x8, x8, x10 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: smulh x9, x9, x11 +; CHECK-NEXT: smulh x12, x12, x13 +; CHECK-NEXT: smulh x10, x14, x10 +; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: fmov d1, x12 +; CHECK-NEXT: fmov d3, x10 +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: splice z2.d, p0, z2.d, z3.d +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %1 = sext <4 x i64> %op1 to <4 x i128> + %2 = sext <4 x i64> %op2 to <4 x i128> + %mul = mul <4 x i128> %1, %2 + %shr = lshr <4 x i128> %mul, + %res = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; UMULH +; + +define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: umulh_v4i8: +; 
CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI14_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI14_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI14_1 +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x8] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z3.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <4 x i8> %op1 to <4 x i16> + %2 = zext <4 x i8> %op2 to <4 x i16> + %mul = mul <4 x i16> %1, %2 + %shr = lshr <4 x i16> %mul, + %res = trunc <4 x i16> %shr to <4 x i8> + ret <4 x i8> %res +} + +define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: umulh_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <8 x i8> %op1 to <8 x i16> + %2 = zext <8 x i8> %op2 to <8 x i16> + %mul = mul <8 x i16> %1, %2 + %shr = lshr <8 x i16> %mul, + %res = trunc <8 x i16> %shr to <8 x i8> + ret <8 x i8> %res +} + +define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: umulh_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <16 x i8> %op1 to <16 x i16> + %2 = zext <16 x i8> %op2 to <16 x i16> + %mul = mul <16 x i16> %1, %2 + %shr = lshr <16 x i16> %mul, + %res = trunc <16 x i16> %shr to <16 x i8> + ret <16 x i8> %res +} + +define void @umulh_v32i8(<32 
x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: umulh_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ptrue p1.h, vl8 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI17_0 +; CHECK-NEXT: uunpklo z5.h, z2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: uunpklo z7.h, z3.b +; CHECK-NEXT: ld1h { z16.h }, p1/z, [x8] +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: mul z5.h, p1/m, z5.h, z7.h +; CHECK-NEXT: mul z2.h, p1/m, z2.h, z3.h +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: lsr z3.h, p1/m, z3.h, z16.h +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: uunpklo z4.h, z0.b +; CHECK-NEXT: uunpklo z6.h, z1.b +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: lsr z2.h, p1/m, z2.h, z16.h +; CHECK-NEXT: mov z5.h, z3.h[7] +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: mul z4.h, p1/m, z4.h, z6.h +; CHECK-NEXT: mov z6.h, z3.h[6] +; CHECK-NEXT: mov z7.h, z3.h[5] +; CHECK-NEXT: mov z17.h, z3.h[4] +; CHECK-NEXT: mov z18.h, z3.h[3] +; CHECK-NEXT: mov z19.h, z3.h[2] +; CHECK-NEXT: mov z20.h, z3.h[1] +; CHECK-NEXT: mov z3.h, z2.h[7] +; CHECK-NEXT: mov z21.h, z2.h[6] +; CHECK-NEXT: mov z22.h, z2.h[5] +; CHECK-NEXT: mov z23.h, z2.h[4] +; CHECK-NEXT: mov z24.h, z2.h[3] +; CHECK-NEXT: mov z25.h, z2.h[2] +; CHECK-NEXT: mov z26.h, z2.h[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mul z0.h, p1/m, z0.h, z1.h +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w8, [sp, #-32]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: strb w9, [sp, #8] +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: strb w10, [sp, #7] +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: lsr z0.h, p1/m, z0.h, z16.h +; CHECK-NEXT: strb w8, [sp, #6] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: strb w9, [sp, #5] +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: strb w10, [sp, #4] +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: strb w8, [sp, #3] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: strb w9, [sp, #2] +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: strb w10, [sp, #1] +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: strb w8, [sp, #15] +; CHECK-NEXT: fmov w8, s23 +; CHECK-NEXT: strb w9, [sp, #14] +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: strb w10, [sp, #13] +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: strb w8, [sp, #12] +; CHECK-NEXT: fmov w8, s26 +; CHECK-NEXT: movprfx z1, z4 +; CHECK-NEXT: lsr z1.h, p1/m, z1.h, z16.h +; CHECK-NEXT: strb w9, [sp, #11] +; CHECK-NEXT: mov z2.h, z1.h[7] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strb w10, [sp, #10] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: strb w8, [sp, #9] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.h, z1.h[6] +; CHECK-NEXT: mov z4.h, z1.h[5] +; CHECK-NEXT: mov z5.h, z1.h[4] +; CHECK-NEXT: strb w9, [sp, #16] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w10, [sp, #24] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w8, [sp, #23] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: mov z6.h, z1.h[3] +; CHECK-NEXT: mov z7.h, z1.h[2] +; CHECK-NEXT: mov z16.h, z1.h[1] +; CHECK-NEXT: strb w9, [sp, #22] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strb w10, [sp, #21] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strb w8, [sp, #20] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z1.h, z0.h[7] +; CHECK-NEXT: mov z17.h, z0.h[6] +; CHECK-NEXT: mov z18.h, z0.h[5] +; CHECK-NEXT: strb w9, [sp, #19] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strb w10, [sp, #18] +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: strb w8, [sp, #17] +; CHECK-NEXT: fmov w8, s18 
+; CHECK-NEXT: mov z19.h, z0.h[4] +; CHECK-NEXT: mov z20.h, z0.h[3] +; CHECK-NEXT: mov z21.h, z0.h[2] +; CHECK-NEXT: strb w9, [sp, #31] +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: strb w10, [sp, #30] +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: strb w8, [sp, #29] +; CHECK-NEXT: fmov w8, s21 +; CHECK-NEXT: mov z22.h, z0.h[1] +; CHECK-NEXT: strb w9, [sp, #28] +; CHECK-NEXT: fmov w9, s22 +; CHECK-NEXT: strb w10, [sp, #27] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: strb w8, [sp, #26] +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: strb w9, [sp, #25] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x10] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x8] +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %1 = zext <32 x i8> %op1 to <32 x i16> + %2 = zext <32 x i8> %op2 to <32 x i16> + %mul = mul <32 x i16> %1, %2 + %shr = lshr <32 x i16> %mul, + %res = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: umulh_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI18_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8] +; CHECK-NEXT: adrp x8, .LCPI18_1 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI18_1 +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z3.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <2 x i16> %op1 to <2 x i32> + %2 = zext <2 x i16> %op2 to <2 x i32> + %mul = mul <2 x i32> %1, %2 + %shr = lshr <2 x i32> %mul, + %res = trunc <2 x i32> %shr to <2 x i16> + ret <2 x i16> %res +} + +define <4 x i16> @umulh_v4i16(<4 x i16> 
%op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: umulh_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <4 x i16> %op1 to <4 x i32> + %2 = zext <4 x i16> %op2 to <4 x i32> + %mul = mul <4 x i32> %1, %2 + %shr = lshr <4 x i32> %mul, + %res = trunc <4 x i32> %shr to <4 x i16> + ret <4 x i16> %res +} + +define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: umulh_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <8 x i16> %op1 to <8 x i32> + %2 = zext <8 x i16> %op2 to <8 x i32> + %mul = mul <8 x i32> %1, %2 + %shr = lshr <8 x i32> %mul, + %res = trunc <8 x i32> %shr to <8 x i16> + ret <8 x i16> %res +} + +define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: umulh_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: umulh z4.h, p0/m, z4.h, z3.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: umulh z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: umulh z3.h, p0/m, z3.h, z2.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z1.h +; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h +; CHECK-NEXT: 
stp q4, q3, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %1 = zext <16 x i16> %op1 to <16 x i32> + %2 = zext <16 x i16> %op2 to <16 x i32> + %mul = mul <16 x i32> %1, %2 + %shr = lshr <16 x i32> %mul, + %res = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: umulh_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <2 x i32> %op1 to <2 x i64> + %2 = zext <2 x i32> %op2 to <2 x i64> + %mul = mul <2 x i64> %1, %2 + %shr = lshr <2 x i64> %mul, + %res = trunc <2 x i64> %shr to <2 x i32> + ret <2 x i32> %res +} + +define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: umulh_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <4 x i32> %op1 to <4 x i64> + %2 = zext <4 x i32> %op2 to <4 x i64> + %mul = mul <4 x i64> %1, %2 + %shr = lshr <4 x i64> %mul, + %res = trunc <4 x i64> %shr to <4 x i32> + ret <4 x i32> %res +} + +define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: umulh_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: umulh z4.s, p0/m, z4.s, z3.s +; CHECK-NEXT: ext 
z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: umulh z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: umulh z3.s, p0/m, z3.s, z2.s +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: splice z4.s, p0, z4.s, z1.s +; CHECK-NEXT: splice z3.s, p0, z3.s, z0.s +; CHECK-NEXT: stp q4, q3, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %insert = insertelement <8 x i64> undef, i64 32, i64 0 + %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer + %1 = zext <8 x i32> %op1 to <8 x i64> + %2 = zext <8 x i32> %op2 to <8 x i64> + %mul = mul <8 x i64> %1, %2 + %shr = lshr <8 x i64> %mul, + %res = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: umulh_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <1 x i64> %op1 to <1 x i128> + %2 = zext <1 x i64> %op2 to <1 x i128> + %mul = mul <1 x i128> %1, %2 + %shr = lshr <1 x i128> %mul, + %res = trunc <1 x i128> %shr to <1 x i64> + ret <1 x i64> %res +} + +define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: umulh_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <2 x i64> %op1 to <2 x i128> + %2 = zext <2 x i64> %op2 to <2 x i128> + %mul = mul <2 x i128> %1, %2 + %shr = lshr <2 x i128> %mul, + %res = 
trunc <2 x i128> %shr to <2 x i64> + ret <2 x i64> %res +} + +define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: umulh_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: mov z2.d, z3.d[1] +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: fmov x13, d2 +; CHECK-NEXT: fmov x14, d4 +; CHECK-NEXT: umulh x8, x8, x10 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: umulh x9, x9, x11 +; CHECK-NEXT: umulh x12, x12, x13 +; CHECK-NEXT: umulh x10, x14, x10 +; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: fmov d1, x12 +; CHECK-NEXT: fmov d3, x10 +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: splice z2.d, p0, z2.d, z3.d +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %1 = zext <4 x i64> %op1 to <4 x i128> + %2 = zext <4 x i64> %op2 to <4 x i128> + %mul = mul <4 x i128> %1, %2 + %shr = lshr <4 x i128> %mul, + %res = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -0,0 +1,774 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = 
"aarch64-unknown-linux-gnu" + +; +; SREM +; + +define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: srem_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: asr z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: sunpklo z2.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = srem <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: srem_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: sunpklo z2.h, z1.b +; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z4.s, z2.h +; CHECK-NEXT: sunpkhi z5.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sdivr z2.s, p0/m, 
z2.s, z3.s +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.h, z2.h[7] +; CHECK-NEXT: mov z5.h, z2.h[5] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.h, z2.h[6] +; CHECK-NEXT: mov z6.h, z2.h[4] +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: mov z16.h, z2.h[2] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strb w8, [sp, #13] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z7.h, z2.h[3] +; CHECK-NEXT: mov z2.h, z2.h[1] +; CHECK-NEXT: strb w10, [sp, #14] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x8] +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = srem <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: srem_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: sunpkhi z2.h, z1.b +; CHECK-NEXT: sunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z5.s, z2.h +; CHECK-NEXT: sunpkhi z6.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z4.h, z1.b +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: sunpkhi z6.s, z4.h +; CHECK-NEXT: sunpkhi z7.s, z3.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z5.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h +; CHECK-NEXT: 
ptrue p0.b, vl16 +; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: srem_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: sunpkhi z5.h, z0.b +; CHECK-NEXT: sunpklo z7.h, z0.b +; CHECK-NEXT: sunpkhi z4.h, z2.b +; CHECK-NEXT: sunpklo z6.h, z2.b +; CHECK-NEXT: sunpkhi z16.s, z4.h +; CHECK-NEXT: sunpkhi z17.s, z5.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpkhi z18.s, z6.h +; CHECK-NEXT: sdivr z16.s, p1/m, z16.s, z17.s +; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.s, z7.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z16.h +; CHECK-NEXT: sdivr z6.s, p1/m, z6.s, z7.s +; CHECK-NEXT: sunpkhi z7.h, z3.b +; CHECK-NEXT: sunpkhi z16.h, z1.b +; CHECK-NEXT: sdiv z5.s, p1/m, z5.s, z18.s +; CHECK-NEXT: sunpkhi z17.s, z7.h +; CHECK-NEXT: sunpkhi z18.s, z16.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sunpklo z16.s, z16.h +; CHECK-NEXT: sdivr z17.s, p1/m, z17.s, z18.s +; CHECK-NEXT: sdivr z7.s, p1/m, z7.s, z16.s +; CHECK-NEXT: sunpklo z16.h, z3.b +; CHECK-NEXT: sunpklo z18.h, z1.b +; CHECK-NEXT: sunpkhi z19.s, z16.h +; CHECK-NEXT: sunpkhi z20.s, z18.h +; CHECK-NEXT: sunpklo z16.s, z16.h +; CHECK-NEXT: sunpklo z18.s, z18.h +; CHECK-NEXT: sdivr z19.s, p1/m, z19.s, z20.s +; CHECK-NEXT: sdivr z16.s, p1/m, z16.s, z18.s +; CHECK-NEXT: uzp1 z7.h, z7.h, z17.h +; CHECK-NEXT: uzp1 z16.h, z16.h, z19.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z5.h +; CHECK-NEXT: uzp1 z6.b, z16.b, z7.b +; 
CHECK-NEXT: uzp1 z4.b, z5.b, z4.b +; CHECK-NEXT: mls z1.b, p0/m, z6.b, z3.b +; CHECK-NEXT: mls z0.b, p0/m, z4.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = srem <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: srem_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z2.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = srem <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: srem_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z4.s, z1.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sunpklo z5.s, z0.h +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl8 +; 
CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: srem_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpkhi z16.s, z1.h +; CHECK-NEXT: sunpkhi z4.s, z2.h +; CHECK-NEXT: sunpkhi z7.s, z3.h +; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s +; CHECK-NEXT: sunpklo z5.s, z3.h +; CHECK-NEXT: sdivr z7.s, p1/m, z7.s, z16.s +; CHECK-NEXT: sunpklo z16.s, z1.h +; CHECK-NEXT: sunpklo z6.s, z2.h +; CHECK-NEXT: sdivr z5.s, p1/m, z5.s, z16.s +; CHECK-NEXT: sunpklo z16.s, z0.h +; CHECK-NEXT: uzp1 z5.h, z5.h, z7.h +; CHECK-NEXT: sdivr z6.s, p1/m, z6.s, z16.s +; CHECK-NEXT: mls z1.h, p0/m, z5.h, z3.h +; CHECK-NEXT: uzp1 z4.h, z6.h, z4.h +; CHECK-NEXT: mls z0.h, p0/m, z4.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = srem <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: srem_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = srem <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: srem_v4i32: +; CHECK: 
// %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @srem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: srem_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z3.s +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z2.s +; CHECK-NEXT: mls z1.s, p0/m, z4.s, z3.s +; CHECK-NEXT: mls z0.s, p0/m, z5.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = srem <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: srem_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = srem <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: srem_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.d, p0/m, 
z2.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @srem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: srem_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: sdiv z4.d, p0/m, z4.d, z3.d +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: sdiv z5.d, p0/m, z5.d, z2.d +; CHECK-NEXT: mls z1.d, p0/m, z4.d, z3.d +; CHECK-NEXT: mls z0.d, p0/m, z5.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = srem <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; UREM +; + +define <4 x i8> @urem_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: urem_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI13_0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; 
CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = urem <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: urem_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: uunpklo z2.h, z1.b +; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z4.s, z2.h +; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.h, z2.h[7] +; CHECK-NEXT: mov z5.h, z2.h[5] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.h, z2.h[6] +; CHECK-NEXT: mov z6.h, z2.h[4] +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: mov z16.h, z2.h[2] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strb w8, [sp, #13] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z7.h, z2.h[3] +; CHECK-NEXT: mov z2.h, z2.h[1] +; CHECK-NEXT: strb w10, [sp, #14] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x8] +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = urem <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 
x i8> %op2) #0 { +; CHECK-LABEL: urem_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: uunpkhi z2.h, z1.b +; CHECK-NEXT: uunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z5.s, z2.h +; CHECK-NEXT: uunpkhi z6.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z4.h, z1.b +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: uunpkhi z6.s, z4.h +; CHECK-NEXT: uunpkhi z7.s, z3.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z5.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: urem_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.b }, p0/z, [x1] +; CHECK-NEXT: uunpkhi z5.h, z0.b +; CHECK-NEXT: uunpklo z7.h, z0.b +; CHECK-NEXT: uunpkhi z4.h, z2.b +; CHECK-NEXT: uunpklo z6.h, z2.b +; CHECK-NEXT: uunpkhi z16.s, z4.h +; CHECK-NEXT: uunpkhi z17.s, z5.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpkhi z18.s, z6.h +; CHECK-NEXT: udivr z16.s, p1/m, z16.s, z17.s +; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.s, z7.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: uzp1 z4.h, 
z4.h, z16.h +; CHECK-NEXT: udivr z6.s, p1/m, z6.s, z7.s +; CHECK-NEXT: uunpkhi z7.h, z3.b +; CHECK-NEXT: uunpkhi z16.h, z1.b +; CHECK-NEXT: udiv z5.s, p1/m, z5.s, z18.s +; CHECK-NEXT: uunpkhi z17.s, z7.h +; CHECK-NEXT: uunpkhi z18.s, z16.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: uunpklo z16.s, z16.h +; CHECK-NEXT: udivr z17.s, p1/m, z17.s, z18.s +; CHECK-NEXT: udivr z7.s, p1/m, z7.s, z16.s +; CHECK-NEXT: uunpklo z16.h, z3.b +; CHECK-NEXT: uunpklo z18.h, z1.b +; CHECK-NEXT: uunpkhi z19.s, z16.h +; CHECK-NEXT: uunpkhi z20.s, z18.h +; CHECK-NEXT: uunpklo z16.s, z16.h +; CHECK-NEXT: uunpklo z18.s, z18.h +; CHECK-NEXT: udivr z19.s, p1/m, z19.s, z20.s +; CHECK-NEXT: udivr z16.s, p1/m, z16.s, z18.s +; CHECK-NEXT: uzp1 z7.h, z7.h, z17.h +; CHECK-NEXT: uzp1 z16.h, z16.h, z19.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z5.h +; CHECK-NEXT: uzp1 z6.b, z16.b, z7.b +; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b +; CHECK-NEXT: mls z1.b, p0/m, z6.b, z3.b +; CHECK-NEXT: mls z0.b, p0/m, z4.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = urem <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: urem_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, 
#8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = urem <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: urem_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z4.s, z1.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uunpklo z5.s, z0.h +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @urem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: urem_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpkhi z16.s, z1.h +; CHECK-NEXT: uunpkhi z4.s, z2.h +; CHECK-NEXT: uunpkhi z7.s, z3.h +; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s +; CHECK-NEXT: uunpklo z5.s, z3.h +; CHECK-NEXT: udivr z7.s, p1/m, z7.s, z16.s +; CHECK-NEXT: uunpklo z16.s, z1.h +; CHECK-NEXT: uunpklo z6.s, z2.h +; CHECK-NEXT: udivr z5.s, p1/m, z5.s, z16.s +; CHECK-NEXT: uunpklo z16.s, z0.h +; CHECK-NEXT: uzp1 z5.h, z5.h, z7.h +; CHECK-NEXT: udivr z6.s, p1/m, z6.s, z16.s +; CHECK-NEXT: mls z1.h, p0/m, z5.h, z3.h +; CHECK-NEXT: uzp1 z4.h, z6.h, z4.h +; 
CHECK-NEXT: mls z0.h, p0/m, z4.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = urem <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: urem_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = urem <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: urem_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @urem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: urem_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z3.s +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z2.s +; CHECK-NEXT: mls z1.s, p0/m, z4.s, z3.s +; CHECK-NEXT: mls z0.s, p0/m, z5.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = urem <8 x i32> %op1, 
%op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: urem_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = urem <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: urem_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @urem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: urem_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: udiv z4.d, p0/m, z4.d, z3.d +; CHECK-NEXT: movprfx z5, z0 +; CHECK-NEXT: udiv z5.d, p0/m, z5.d, z2.d +; CHECK-NEXT: mls z1.d, p0/m, z4.d, z3.d +; CHECK-NEXT: mls z0.d, p0/m, z5.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = urem <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" }