diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1395,6 +1395,8 @@
     setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
 
   for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
+    if (!Subtarget->forceSVEInStreamingMode())
+      break;
     if (useSVEForFixedLengthVectorVT(VT,
                                      Subtarget->forceSVEInStreamingMode())) {
       setOperationAction(ISD::ANY_EXTEND, VT, Custom);
@@ -1402,10 +1404,20 @@
       setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
       setOperationAction(ISD::LOAD, VT, Custom);
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+      setOperationAction(ISD::ADD, VT, Custom);
+      setOperationAction(ISD::SUB, VT, Custom);
+      setOperationAction(ISD::MUL, VT, Custom);
+      setOperationAction(ISD::MULHS, VT, Custom);
+      setOperationAction(ISD::MULHU, VT, Custom);
+      setOperationAction(ISD::ABS, VT, Custom);
+      setOperationAction(ISD::AND, VT, Custom);
+      setOperationAction(ISD::XOR, VT, Custom);
     }
   }
 
   for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
+    if (!Subtarget->forceSVEInStreamingMode())
+      break;
     if (useSVEForFixedLengthVectorVT(VT,
                                      Subtarget->forceSVEInStreamingMode())) {
       setOperationAction(ISD::ANY_EXTEND, VT, Custom);
@@ -1413,6 +1425,14 @@
       setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
       setOperationAction(ISD::LOAD, VT, Custom);
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+      setOperationAction(ISD::ADD, VT, Custom);
+      setOperationAction(ISD::SUB, VT, Custom);
+      setOperationAction(ISD::MUL, VT, Custom);
+      setOperationAction(ISD::MULHS, VT, Custom);
+      setOperationAction(ISD::MULHU, VT, Custom);
+      setOperationAction(ISD::ABS, VT, Custom);
+      setOperationAction(ISD::AND, VT, Custom);
+      setOperationAction(ISD::XOR, VT, Custom);
     }
   }
 
@@ -3524,7 +3544,8 @@
 }
 
 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
-  if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
+                                   Subtarget->forceSVEInStreamingMode()))
     return LowerToScalableOp(Op, DAG);
 
   SDValue Sel = Op.getOperand(0);
@@ -4435,10 +4456,8 @@
 
 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
-  // If SVE is available then i64 vector multiplications can also be made legal.
-  bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
-
-  if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
+  if (VT.isScalableVector() ||
+      useSVEForFixedLengthVectorVT(VT, Subtarget->forceSVEInStreamingMode()))
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
 
   // Multiplications are only custom-lowered for 128-bit vectors so that
@@ -11474,7 +11493,8 @@
 
 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                              SelectionDAG &DAG) const {
-  if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
+                                   Subtarget->forceSVEInStreamingMode()))
     return LowerToScalableOp(Op, DAG);
 
   // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
@@ -0,0 +1,1839 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=128 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
+; RUN: llc -aarch64-sve-vector-bits-min=256 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; ADD
+;
+define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+; CHECK-LABEL: add_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    add z0.b, z0.b, z1.b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = add <8 x i8> %op1, %op2
+  ret <8 x i8> %res
+}
+
+define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+; CHECK-LABEL: add_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    add z0.b, z0.b, z1.b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = add <16 x i8> %op1, %op2
+  ret <16 x i8> %res
+}
+
+define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+; VBITS_GE_128-LABEL: add_v32i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov w8, #16
+; VBITS_GE_128-NEXT:    ptrue p0.b, vl16
+; VBITS_GE_128-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_128-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_128-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    add z1.b, z1.b, z3.b
+; VBITS_GE_128-NEXT:    add z0.b, z0.b, z2.b
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: add_v32i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    add z0.b, z0.b, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: add_v32i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    add z0.b, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <32 x i8>, <32 x i8>* %a
+  %op2 = load <32 x i8>, <32 x i8>* %b
+  %res = add <32 x i8> %op1, %op2
+  store <32 x i8> %res, <32 x i8>* %a
+  ret void
+}
+
+define void @add_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
+; VBITS_GE_128-LABEL: add_v64i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov w8, #32
+; VBITS_GE_128-NEXT:    mov w9, #48
+; VBITS_GE_128-NEXT:    mov w10, #16
+; VBITS_GE_128-NEXT:    ptrue p0.b, vl16
+; VBITS_GE_128-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_128-NEXT:    ld1b { z1.b }, p0/z, [x0, x9]
+; VBITS_GE_128-NEXT:    ld1b { z2.b }, p0/z, [x0, x10]
+; VBITS_GE_128-NEXT:    ld1b { z3.b }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1b { z4.b }, p0/z, [x1, x8]
+; VBITS_GE_128-NEXT:    ld1b { z5.b }, p0/z, [x1, x9]
+; VBITS_GE_128-NEXT:    ld1b { z6.b }, p0/z, [x1, x10]
+; VBITS_GE_128-NEXT:    ld1b { z7.b }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    add z0.b, z0.b, z4.b
+; VBITS_GE_128-NEXT:    add z1.b, z1.b, z5.b
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    add z0.b, z3.b, z7.b
+; VBITS_GE_128-NEXT:    add z1.b, z2.b, z6.b
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: add_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    add z0.b, z0.b, z2.b
+; VBITS_GE_256-NEXT:    add z1.b, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: add_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    add z0.b, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <64 x i8>, <64 x i8>* %a
+  %op2 = load <64 x i8>, <64 x i8>* %b
+  %res = add <64 x i8> %op1, %op2
+  store <64 x i8> %res, <64 x i8>* %a
+  ret void
+}
+
+define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+; CHECK-LABEL: add_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    add z0.h, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = add <4 x i16> %op1, %op2
+  ret <4 x i16> %res
+}
+
+define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+; CHECK-LABEL: add_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    add z0.h, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = add <8 x i16> %op1, %op2
+  ret <8 x i16> %res
+}
+
+define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+; VBITS_GE_128-LABEL: add_v16i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #8
+; VBITS_GE_128-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_128-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    add z1.h, z1.h, z3.h
+; VBITS_GE_128-NEXT:    add z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: add_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    add z0.h, z0.h, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: add_v16i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    add z0.h, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <16 x i16>, <16 x i16>* %a
+  %op2 = load <16 x i16>, <16 x i16>* %b
+  %res = add <16 x i16> %op1, %op2
+  store <16 x i16> %res, <16 x i16>* %a
+  ret void
+}
+
+define void @add_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+; VBITS_GE_128-LABEL: add_v32i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #16
+; VBITS_GE_128-NEXT:    mov x9, #24
+; VBITS_GE_128-NEXT:    mov x10, #8
+; VBITS_GE_128-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_128-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z3.h }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1h { z4.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z5.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z6.h }, p0/z, [x1, x10, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z7.h }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    add z0.h, z0.h, z4.h
+; VBITS_GE_128-NEXT:    add z1.h, z1.h, z5.h
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    add z0.h, z3.h, z7.h
+; VBITS_GE_128-NEXT:    add z1.h, z2.h, z6.h
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: add_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    add z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT:    add z1.h, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: add_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    add z0.h, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <32 x i16>, <32 x i16>* %a
+  %op2 = load <32 x i16>, <32 x i16>* %b
+  %res = add <32 x i16> %op1, %op2
+  store <32 x i16> %res, <32 x i16>* %a
+  ret void
+}
+
+define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+; CHECK-LABEL: add_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = add <2 x i32> %op1, %op2
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+; CHECK-LABEL: add_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = add <4 x i32> %op1, %op2
+  ret <4 x i32> %res
+}
+
+define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+; VBITS_GE_128-LABEL: add_v8i32:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #4
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    add z1.s, z1.s, z3.s
+; VBITS_GE_128-NEXT:    add z0.s, z0.s, z2.s
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: add_v8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    add z0.s, z0.s, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: add_v8i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    add z0.s, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %op2 = load <8 x i32>, <8 x i32>* %b
+  %res = add <8 x i32> %op1, %op2
+  store <8 x i32> %res, <8 x i32>* %a
+  ret void
+}
+
+define void @add_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
+; VBITS_GE_128-LABEL: add_v16i32:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #8
+; VBITS_GE_128-NEXT:    mov x9, #12
+; VBITS_GE_128-NEXT:    mov x10, #4
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z3.s }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1w { z4.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z5.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z6.s }, p0/z, [x1, x10, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z7.s }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    add z0.s, z0.s, z4.s
+; VBITS_GE_128-NEXT:    add z1.s, z1.s, z5.s
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    add z0.s, z3.s, z7.s
+; VBITS_GE_128-NEXT:    add z1.s, z2.s, z6.s
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: add_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    add z0.s, z0.s, z2.s
+; VBITS_GE_256-NEXT:    add z1.s, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: add_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    add z0.s, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = add <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+; CHECK-LABEL: add_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = add <1 x i64> %op1, %op2
+  ret <1 x i64> %res
+}
+
+define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+; CHECK-LABEL: add_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = add <2 x i64> %op1, %op2
+  ret <2 x i64> %res
+}
+
+define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: add_v4i64:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #2
+; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
+; VBITS_GE_128-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    add z1.d, z1.d, z3.d
+; VBITS_GE_128-NEXT:    add z0.d, z0.d, z2.d
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: add_v4i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    add z0.d, z0.d, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: add_v4i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    add z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <4 x i64>, <4 x i64>* %a
+  %op2 = load <4 x i64>, <4 x i64>* %b
+  %res = add <4 x i64> %op1, %op2
+  store <4 x i64> %res, <4 x i64>* %a
+  ret void
+}
+
+define void @add_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: add_v8i64:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #4
+; VBITS_GE_128-NEXT:    mov x9, #6
+; VBITS_GE_128-NEXT:    mov x10, #2
+; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
+; VBITS_GE_128-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z3.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z5.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z6.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z7.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    add z0.d, z0.d, z4.d
+; VBITS_GE_128-NEXT:    add z1.d, z1.d, z5.d
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    add z0.d, z3.d, z7.d
+; VBITS_GE_128-NEXT:    add z1.d, z2.d, z6.d
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: add_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    add z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    add z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: add_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    add z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <8 x i64>, <8 x i64>* %a
+  %op2 = load <8 x i64>, <8 x i64>* %b
+  %res = add <8 x i64> %op1, %op2
+  store <8 x i64> %res, <8 x i64>* %a
+  ret void
+}
+
+;
+; MUL
+;
+
+define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+; CHECK-LABEL: mul_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = mul <8 x i8> %op1, %op2
+  ret <8 x i8> %res
+}
+
+define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+; CHECK-LABEL: mul_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = mul <16 x i8> %op1, %op2
+  ret <16 x i8> %res
+}
+
+define void @mul_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+; VBITS_GE_128-LABEL: mul_v32i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov w8, #16
+; VBITS_GE_128-NEXT:    ptrue p0.b, vl16
+; VBITS_GE_128-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_128-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_128-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    mul z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_128-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: mul_v32i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mul z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: mul_v32i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    mul z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <32 x i8>, <32 x i8>* %a
+  %op2 = load <32 x i8>, <32 x i8>* %b
+  %res = mul <32 x i8> %op1, %op2
+  store <32 x i8> %res, <32 x i8>* %a
+  ret void
+}
+
+define void @mul_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
+; VBITS_GE_128-LABEL: mul_v64i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov w8, #32
+; VBITS_GE_128-NEXT:    mov w9, #48
+; VBITS_GE_128-NEXT:    mov w10, #16
+; VBITS_GE_128-NEXT:    ptrue p0.b, vl16
+; VBITS_GE_128-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_128-NEXT:    ld1b { z1.b }, p0/z, [x0, x9]
+; VBITS_GE_128-NEXT:    ld1b { z2.b }, p0/z, [x0, x10]
+; VBITS_GE_128-NEXT:    ld1b { z3.b }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1b { z4.b }, p0/z, [x1, x8]
+; VBITS_GE_128-NEXT:    ld1b { z5.b }, p0/z, [x1, x9]
+; VBITS_GE_128-NEXT:    ld1b { z6.b }, p0/z, [x1, x10]
+; VBITS_GE_128-NEXT:    ld1b { z7.b }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    mul z0.b, p0/m, z0.b, z4.b
+; VBITS_GE_128-NEXT:    mul z1.b, p0/m, z1.b, z5.b
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    movprfx z0, z3
+; VBITS_GE_128-NEXT:    mul z0.b, p0/m, z0.b, z7.b
+; VBITS_GE_128-NEXT:    movprfx z1, z2
+; VBITS_GE_128-NEXT:    mul z1.b, p0/m, z1.b, z6.b
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: mul_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; VBITS_GE_256-NEXT:    mul z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: mul_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    mul z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <64 x i8>, <64 x i8>* %a
+  %op2 = load <64 x i8>, <64 x i8>* %b
+  %res = mul <64 x i8> %op1, %op2
+  store <64 x i8> %res, <64 x i8>* %a
+  ret void
+}
+
+define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+; CHECK-LABEL: mul_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = mul <4 x i16> %op1, %op2
+  ret <4 x i16> %res
+}
+
+define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+; CHECK-LABEL: mul_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = mul <8 x i16> %op1, %op2
+  ret <8 x i16> %res
+}
+
+define void @mul_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+; VBITS_GE_128-LABEL: mul_v16i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #8
+; VBITS_GE_128-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_128-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    mul z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_128-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: mul_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: mul_v16i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <16 x i16>, <16 x i16>* %a
+  %op2 = load <16 x i16>, <16 x i16>* %b
+  %res = mul <16 x i16> %op1, %op2
+  store <16 x i16> %res, <16 x i16>* %a
+  ret void
+}
+
+define void @mul_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+; VBITS_GE_128-LABEL: mul_v32i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #16
+; VBITS_GE_128-NEXT:    mov x9, #24
+; VBITS_GE_128-NEXT:    mov x10, #8
+; VBITS_GE_128-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_128-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z3.h }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1h { z4.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z5.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z6.h }, p0/z, [x1, x10, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z7.h }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    mul z0.h, p0/m, z0.h, z4.h
+; VBITS_GE_128-NEXT:    mul z1.h, p0/m, z1.h, z5.h
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    movprfx z0, z3
+; VBITS_GE_128-NEXT:    mul z0.h, p0/m, z0.h, z7.h
+; VBITS_GE_128-NEXT:    movprfx z1, z2
+; VBITS_GE_128-NEXT:    mul z1.h, p0/m, z1.h, z6.h
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: mul_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; VBITS_GE_256-NEXT:    mul z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: mul_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <32 x i16>, <32 x i16>* %a
+  %op2 = load <32 x i16>, <32 x i16>* %b
+  %res = mul <32 x i16> %op1, %op2
+  store <32 x i16> %res, <32 x i16>* %a
+  ret void
+}
+
+define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+; CHECK-LABEL: mul_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = mul <2 x i32> %op1, %op2
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+; CHECK-LABEL: mul_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = mul <4 x i32> %op1, %op2
+  ret <4 x i32> %res
+}
+
+define void @mul_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+; VBITS_GE_128-LABEL: mul_v8i32:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #4
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    mul z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_128-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: mul_v8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: mul_v8i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %op2 = load <8 x i32>, <8 x i32>* %b
+  %res = mul <8 x i32> %op1, %op2
+  store <8 x i32> %res, <8 x i32>* %a
+  ret void
+}
+
+define void @mul_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
+; VBITS_GE_128-LABEL: mul_v16i32:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #8
+; VBITS_GE_128-NEXT:    mov x9, #12
+; VBITS_GE_128-NEXT:    mov x10, #4
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z3.s }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1w { z4.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z5.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z6.s }, p0/z, [x1, x10, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z7.s }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    mul z0.s, p0/m, z0.s, z4.s
+; VBITS_GE_128-NEXT:    mul z1.s, p0/m, z1.s, z5.s
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    movprfx z0, z3
+; VBITS_GE_128-NEXT:    mul z0.s, p0/m, z0.s, z7.s
+; VBITS_GE_128-NEXT:    movprfx z1, z2
+; VBITS_GE_128-NEXT:    mul z1.s, p0/m, z1.s, z6.s
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: mul_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_256-NEXT:    mul z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: mul_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = mul <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+; CHECK-LABEL: mul_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = mul <1 x i64> %op1, %op2
+  ret <1 x i64> %res
+}
+
+define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+; CHECK-LABEL: mul_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = mul <2 x i64> %op1, %op2
+  ret <2 x i64> %res
+}
+
+define void @mul_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: mul_v4i64:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #2
+; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
+; VBITS_GE_128-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    mul z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_128-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: mul_v4i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: mul_v4i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <4 x i64>, <4 x i64>* %a
+  %op2 = load <4 x i64>, <4 x i64>* %b
+  %res = mul <4 x i64> %op1, %op2
+  store <4 x i64> %res, <4 x i64>* %a
+  ret void
+}
+
+define void @mul_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: mul_v8i64:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #4
+; VBITS_GE_128-NEXT:    mov x9, #6
+; VBITS_GE_128-NEXT:    mov x10, #2
+; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
+; VBITS_GE_128-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z3.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z5.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z6.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z7.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    mul z0.d, p0/m, z0.d, z4.d
+; VBITS_GE_128-NEXT:    mul z1.d, p0/m, z1.d, z5.d
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    movprfx z0, z3
+; VBITS_GE_128-NEXT:    mul z0.d, p0/m, z0.d, z7.d
+; VBITS_GE_128-NEXT:    movprfx z1, z2
+; VBITS_GE_128-NEXT:    mul z1.d, p0/m, z1.d, z6.d
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: mul_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; VBITS_GE_256-NEXT:    mul z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: mul_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <8 x i64>, <8 x i64>* %a
+  %op2 = load <8 x i64>, <8 x i64>* %b
+  %res = mul <8 x i64> %op1, %op2
+  store <8 x i64> %res, <8 x i64>* %a
+  ret void
+}
+
+;
+; SUB
+;
+
+define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+; CHECK-LABEL: sub_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    sub z0.b, z0.b, z1.b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sub <8 x i8> %op1, %op2
+  ret <8 x i8> %res
+}
+
+define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+; CHECK-LABEL: sub_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    sub z0.b, z0.b, z1.b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sub <16 x i8> %op1, %op2
+  ret <16 x i8> %res
+}
+
+define void @sub_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
+; VBITS_GE_128-LABEL: sub_v32i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov w8, #16
+; VBITS_GE_128-NEXT:    ptrue p0.b, vl16
+; VBITS_GE_128-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_128-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_128-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    sub z1.b, z1.b, z3.b
+; VBITS_GE_128-NEXT:    sub z0.b, z0.b, z2.b
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sub_v32i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sub z0.b, z0.b, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sub_v32i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sub z0.b, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <32 x i8>, <32 x i8>* %a
+  %op2 = load <32 x i8>, <32 x i8>* %b
+  %res = sub <32 x i8> %op1, %op2
+  store <32 x i8> %res, <32 x i8>* %a
+  ret void
+}
+
+define void @sub_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
+; VBITS_GE_128-LABEL: sub_v64i8:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov w8, #32
+; VBITS_GE_128-NEXT:    mov w9, #48
+; VBITS_GE_128-NEXT:    mov w10, #16
+; VBITS_GE_128-NEXT:    ptrue p0.b, vl16
+; VBITS_GE_128-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_128-NEXT:    ld1b { z1.b }, p0/z, [x0, x9]
+; VBITS_GE_128-NEXT:    ld1b { z2.b }, p0/z, [x0, x10]
+; VBITS_GE_128-NEXT:    ld1b { z3.b }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1b { z4.b }, p0/z, [x1, x8]
+; VBITS_GE_128-NEXT:    ld1b { z5.b }, p0/z, [x1, x9]
+; VBITS_GE_128-NEXT:    ld1b { z6.b }, p0/z, [x1, x10]
+; VBITS_GE_128-NEXT:    ld1b { z7.b }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    sub z0.b, z0.b, z4.b
+; VBITS_GE_128-NEXT:    sub z1.b, z1.b, z5.b
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    sub z0.b, z3.b, z7.b
+; VBITS_GE_128-NEXT:    sub z1.b, z2.b, z6.b
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sub_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sub z0.b, z0.b, z2.b
+; VBITS_GE_256-NEXT:    sub z1.b, z1.b, z3.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sub_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sub z0.b, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <64 x i8>, <64 x i8>* %a
+  %op2 = load <64 x i8>, <64 x i8>* %b
+  %res = sub <64 x i8> %op1, %op2
+  store <64 x i8> %res, <64 x i8>* %a
+  ret void
+}
+
+define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+; CHECK-LABEL: sub_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    sub z0.h, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sub <4 x i16> %op1, %op2
+  ret <4 x i16> %res
+}
+
+define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+; CHECK-LABEL: sub_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    sub z0.h, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sub <8 x i16> %op1, %op2
+  ret <8 x i16> %res
+}
+
+define void @sub_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+; VBITS_GE_128-LABEL: sub_v16i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #8
+; VBITS_GE_128-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_128-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    sub z1.h, z1.h, z3.h
+; VBITS_GE_128-NEXT:    sub z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sub_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sub z0.h, z0.h, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sub_v16i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sub z0.h, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <16 x i16>, <16 x i16>* %a
+  %op2 = load <16 x i16>, <16 x i16>* %b
+  %res = sub <16 x i16> %op1, %op2
+  store <16 x i16> %res, <16 x i16>* %a
+  ret void
+}
+
+define void @sub_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+; VBITS_GE_128-LABEL: sub_v32i16:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #16
+; VBITS_GE_128-NEXT:    mov x9, #24
+; VBITS_GE_128-NEXT:    mov x10, #8
+; VBITS_GE_128-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_128-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z3.h }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1h { z4.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z5.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z6.h }, p0/z, [x1, x10, lsl #1]
+; VBITS_GE_128-NEXT:    ld1h { z7.h }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    sub z0.h, z0.h, z4.h
+; VBITS_GE_128-NEXT:    sub z1.h, z1.h, z5.h
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    sub z0.h, z3.h, z7.h
+; VBITS_GE_128-NEXT:    sub z1.h, z2.h, z6.h
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sub_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sub z0.h, z0.h, z2.h
+; VBITS_GE_256-NEXT:    sub z1.h, z1.h, z3.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sub_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sub z0.h, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <32 x i16>, <32 x i16>* %a
+  %op2 = load <32 x i16>, <32 x i16>* %b
+  %res = sub <32 x i16> %op1, %op2
+  store <32 x i16> %res, <32 x i16>* %a
+  ret void
+}
+
+define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+; CHECK-LABEL: sub_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    sub z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sub <2 x i32> %op1, %op2
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+; CHECK-LABEL: sub_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    sub z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sub <4 x i32> %op1, %op2
+  ret <4 x i32> %res
+}
+
+define void @sub_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+; VBITS_GE_128-LABEL: sub_v8i32:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #4
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    sub z1.s, z1.s, z3.s
+; VBITS_GE_128-NEXT:    sub z0.s, z0.s, z2.s
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sub_v8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sub z0.s, z0.s, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sub_v8i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sub z0.s, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %op2 = load <8 x i32>, <8 x i32>* %b
+  %res = sub <8 x i32> %op1, %op2
+  store <8 x i32> %res, <8 x i32>* %a
+  ret void
+}
+
+define void @sub_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
+; VBITS_GE_128-LABEL: sub_v16i32:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #8
+; VBITS_GE_128-NEXT:    mov x9, #12
+; VBITS_GE_128-NEXT:    mov x10, #4
+; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z3.s }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1w { z4.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z5.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z6.s }, p0/z, [x1, x10, lsl #2]
+; VBITS_GE_128-NEXT:    ld1w { z7.s }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    sub z0.s, z0.s, z4.s
+; VBITS_GE_128-NEXT:    sub z1.s, z1.s, z5.s
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    sub z0.s, z3.s, z7.s
+; VBITS_GE_128-NEXT:    sub z1.s, z2.s, z6.s
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sub_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sub z0.s, z0.s, z2.s
+; VBITS_GE_256-NEXT:    sub z1.s, z1.s, z3.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sub_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sub z0.s, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = sub <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+; CHECK-LABEL: sub_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    sub z0.d, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sub <1 x i64> %op1, %op2
+  ret <1 x i64> %res
+}
+
+define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+; CHECK-LABEL: sub_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    sub z0.d, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = sub <2 x i64> %op1, %op2
+  ret <2 x i64> %res
+}
+
+define void @sub_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: sub_v4i64:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #2
+; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
+; VBITS_GE_128-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    sub z1.d, z1.d, z3.d
+; VBITS_GE_128-NEXT:    sub z0.d, z0.d, z2.d
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sub_v4i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sub z0.d, z0.d, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sub_v4i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sub z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <4 x i64>, <4 x i64>* %a
+  %op2 = load <4 x i64>, <4 x i64>* %b
+  %res = sub <4 x i64> %op1, %op2
+  store <4 x i64> %res, <4 x i64>* %a
+  ret void
+}
+
+define void @sub_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: sub_v8i64:
+; VBITS_GE_128:       // %bb.0:
+; VBITS_GE_128-NEXT:    mov x8, #4
+; VBITS_GE_128-NEXT:    mov x9, #6
+; VBITS_GE_128-NEXT:    mov x10, #2
+; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
+; VBITS_GE_128-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z3.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z5.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z6.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_128-NEXT:    ld1d { z7.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT:    sub z0.d, z0.d, z4.d
+; VBITS_GE_128-NEXT:    sub z1.d, z1.d, z5.d
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    sub z0.d, z3.d, z7.d
+; VBITS_GE_128-NEXT:    sub z1.d, z2.d, z6.d
+; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ret
+;
+; VBITS_GE_256-LABEL: sub_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sub z0.d, z0.d, z2.d
+; VBITS_GE_256-NEXT:    sub z1.d, z1.d, z3.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: sub_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sub z0.d, z0.d, z1.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
+  %op1 = load <8 x i64>, <8 x i64>* %a
+  %op2 = load <8 x i64>, <8 x i64>* %b
+  %res = sub <8 x i64> %op1, %op2
+  store <8 x i64> %res, <8 x i64>* %a
+  ret void
+}
+
+
+;
+; ABS
+;
+
+define <8 x i8> @abs_v8i8(<8 x i8> %op1) #0 {
+; CHECK-LABEL: abs_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    abs z0.b, p0/m, z0.b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false)
+  ret <8 x i8> %res
+}
+
+define <16 x i8> @abs_v16i8(<16 x i8> %op1) #0 {
+; CHECK-LABEL: abs_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    abs z0.b, p0/m, z0.b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false)
+  ret <16 x i8> %res
+}
+
+define void @abs_v32i8(<32 x i8>* %a) #0 {
+; VBITS_GE_128-LABEL: abs_v32i8:
+; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: abs z1.b, p0/m, z1.b +; VBITS_GE_128-NEXT: abs z0.b, p0/m, z0.b +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: abs_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: abs z0.b, p0/m, z0.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: abs_v32i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl32 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: abs z0.b, p0/m, z0.b +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false) + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @abs_v64i8(<64 x i8>* %a) #0 { +; VBITS_GE_128-LABEL: abs_v64i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #32 +; VBITS_GE_128-NEXT: mov w9, #48 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: mov w10, #16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: abs z0.b, p0/m, z0.b +; VBITS_GE_128-NEXT: abs z1.b, p0/m, z1.b +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: movprfx z0, z3 +; VBITS_GE_128-NEXT: abs z0.b, p0/m, z3.b +; VBITS_GE_128-NEXT: movprfx z1, z2 +; VBITS_GE_128-NEXT: abs z1.b, p0/m, z2.b +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: abs_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: abs z0.b, p0/m, z0.b +; VBITS_GE_256-NEXT: abs z1.b, p0/m, z1.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: abs_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: abs z0.b, p0/m, z0.b +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %res = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %op1, i1 false) + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <4 x i16> @abs_v4i16(<4 x i16> %op1) #0 { +; CHECK-LABEL: abs_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false) + ret <4 x i16> %res +} + +define <8 x i16> @abs_v8i16(<8 x i16> %op1) #0 { +; CHECK-LABEL: abs_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false) + ret <8 x i16> %res +} + +define void @abs_v16i16(<16 x i16>* %a) #0 { +; VBITS_GE_128-LABEL: 
abs_v16i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: abs z1.h, p0/m, z1.h +; VBITS_GE_128-NEXT: abs z0.h, p0/m, z0.h +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: abs_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: abs z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: abs_v16i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl16 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: abs z0.h, p0/m, z0.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false) + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @abs_v32i16(<32 x i16>* %a) #0 { +; VBITS_GE_128-LABEL: abs_v32i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #16 +; VBITS_GE_128-NEXT: mov x9, #24 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: mov x10, #8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: abs z0.h, p0/m, z0.h +; VBITS_GE_128-NEXT: abs z1.h, p0/m, z1.h +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: movprfx z0, z3 +; VBITS_GE_128-NEXT: abs z0.h, p0/m, z3.h +; VBITS_GE_128-NEXT: movprfx z1, z2 +; VBITS_GE_128-NEXT: abs z1.h, p0/m, z2.h +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: abs_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: abs z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: abs z1.h, p0/m, z1.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: abs_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: abs z0.h, p0/m, z0.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %op1, i1 false) + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @abs_v2i32(<2 x i32> %op1) #0 { +; CHECK-LABEL: abs_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false) + ret <2 x i32> %res +} + +define <4 x i32> @abs_v4i32(<4 x i32> %op1) #0 { +; CHECK-LABEL: abs_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false) + ret <4 x i32> %res +} 
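+
+; Descriptive note (not an autogenerated check): in each ABS case above the
+; operand is kept in an SVE Z register (the "// kill" comments mark the
+; implicit D/Q-to-Z register conversion) and the operation is lowered to a
+; predicated abs whose "ptrue ..., vlN" governing predicate matches the fixed
+; vector length, rather than to a NEON abs, as expected when SVE is forced in
+; streaming-compatible functions.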
+ +define void @abs_v8i32(<8 x i32>* %a) #0 { +; VBITS_GE_128-LABEL: abs_v8i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: abs z1.s, p0/m, z1.s +; VBITS_GE_128-NEXT: abs z0.s, p0/m, z0.s +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: abs_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: abs z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: abs_v8i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: abs z0.s, p0/m, z0.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @abs_v16i32(<16 x i32>* %a) #0 { +; VBITS_GE_128-LABEL: abs_v16i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: mov x9, #12 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: mov x10, #4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: abs z0.s, p0/m, z0.s +; VBITS_GE_128-NEXT: abs z1.s, p0/m, z1.s +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: movprfx z0, z3 +; VBITS_GE_128-NEXT: abs z0.s, p0/m, z3.s +; VBITS_GE_128-NEXT: movprfx z1, z2 +; VBITS_GE_128-NEXT: abs z1.s, p0/m, z2.s +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: abs_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: abs z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: abs z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: abs_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: abs z0.s, p0/m, z0.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %res = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %op1, i1 false) + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @abs_v1i64(<1 x i64> %op1) #0 { +; CHECK-LABEL: abs_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false) + ret <1 x i64> %res +} + +define <2 x i64> @abs_v2i64(<2 x i64> %op1) #0 { +; CHECK-LABEL: abs_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <2 x i64> 
@llvm.abs.v2i64(<2 x i64> %op1, i1 false) + ret <2 x i64> %res +} + +define void @abs_v4i64(<4 x i64>* %a) #0 { +; VBITS_GE_128-LABEL: abs_v4i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: abs z1.d, p0/m, z1.d +; VBITS_GE_128-NEXT: abs z0.d, p0/m, z0.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: abs_v4i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: abs z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: abs_v4i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl4 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: abs z0.d, p0/m, z0.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @abs_v8i64(<8 x i64>* %a) #0 { +; VBITS_GE_128-LABEL: abs_v8i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: mov x9, #6 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: mov x10, #2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: abs z0.d, p0/m, z0.d +; VBITS_GE_128-NEXT: abs z1.d, p0/m, z1.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: movprfx z0, z3 +; VBITS_GE_128-NEXT: abs z0.d, p0/m, z3.d +; VBITS_GE_128-NEXT: movprfx z1, z2 +; VBITS_GE_128-NEXT: abs z1.d, p0/m, z2.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: abs_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: abs z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: abs z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: abs_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: abs z0.d, p0/m, z0.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %res = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %op1, i1 false) + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1) +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) +declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) +declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1) +declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1) +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) +declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1) +declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1) +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) +declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1) +declare <1 x i64> @llvm.abs.v1i64(<1 x i64>, 
i1) +declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1) +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) +declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1) + + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -0,0 +1,1594 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -aarch64-sve-vector-bits-min=128 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128 +; RUN: llc -aarch64-sve-vector-bits-min=256 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=512 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=2048 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 + +target triple = "aarch64-unknown-linux-gnu" + +; +; SDIV +; + +define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; VBITS_GE_128-LABEL: sdiv_v8i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_128-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_128-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_128-NEXT: sunpkhi z2.s, z1.h +; VBITS_GE_128-NEXT: sunpkhi z3.s, z0.h +; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h +; VBITS_GE_128-NEXT: ptrue p0.b, vl8 +; VBITS_GE_128-NEXT: fmov w8, s0 +; VBITS_GE_128-NEXT: mov z1.h, z0.h[7] +; VBITS_GE_128-NEXT: mov z3.h, z0.h[5] +; VBITS_GE_128-NEXT: fmov w9, s1 +; VBITS_GE_128-NEXT: mov z2.h, z0.h[6] +; VBITS_GE_128-NEXT: mov z4.h, z0.h[4] +; VBITS_GE_128-NEXT: strb w8, [sp, #8] +; VBITS_GE_128-NEXT: fmov w8, s3 +; VBITS_GE_128-NEXT: mov z6.h, z0.h[2] +; VBITS_GE_128-NEXT: fmov w10, s2 +; VBITS_GE_128-NEXT: strb w9, [sp, #15] +; VBITS_GE_128-NEXT: fmov w9, s4 +; VBITS_GE_128-NEXT: strb w8, [sp, #13] +; VBITS_GE_128-NEXT: fmov w8, s6 +; VBITS_GE_128-NEXT: mov z5.h, z0.h[3] +; VBITS_GE_128-NEXT: mov z0.h, z0.h[1] +; VBITS_GE_128-NEXT: strb w10, [sp, #14] +; VBITS_GE_128-NEXT: fmov w10, s5 +; VBITS_GE_128-NEXT: strb w9, [sp, #12] +; VBITS_GE_128-NEXT: fmov w9, s0 +; VBITS_GE_128-NEXT: strb w8, [sp, #10] +; VBITS_GE_128-NEXT: add x8, sp, #8 +; VBITS_GE_128-NEXT: strb w10, [sp, #11] +; VBITS_GE_128-NEXT: strb w9, [sp, #9] +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x8] +; VBITS_GE_128-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: sdiv_v8i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: sub sp, sp, #16 +; VBITS_GE_256-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sdiv 
z0.s, p0/m, z0.s, z1.s +; VBITS_GE_256-NEXT: ptrue p0.b, vl8 +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: fmov w8, s0 +; VBITS_GE_256-NEXT: mov z1.h, z0.h[7] +; VBITS_GE_256-NEXT: mov z3.h, z0.h[5] +; VBITS_GE_256-NEXT: fmov w9, s1 +; VBITS_GE_256-NEXT: mov z2.h, z0.h[6] +; VBITS_GE_256-NEXT: mov z4.h, z0.h[4] +; VBITS_GE_256-NEXT: strb w8, [sp, #8] +; VBITS_GE_256-NEXT: fmov w8, s3 +; VBITS_GE_256-NEXT: mov z6.h, z0.h[2] +; VBITS_GE_256-NEXT: fmov w10, s2 +; VBITS_GE_256-NEXT: strb w9, [sp, #15] +; VBITS_GE_256-NEXT: fmov w9, s4 +; VBITS_GE_256-NEXT: strb w8, [sp, #13] +; VBITS_GE_256-NEXT: fmov w8, s6 +; VBITS_GE_256-NEXT: mov z5.h, z0.h[3] +; VBITS_GE_256-NEXT: mov z0.h, z0.h[1] +; VBITS_GE_256-NEXT: strb w10, [sp, #14] +; VBITS_GE_256-NEXT: fmov w10, s5 +; VBITS_GE_256-NEXT: strb w9, [sp, #12] +; VBITS_GE_256-NEXT: fmov w9, s0 +; VBITS_GE_256-NEXT: strb w8, [sp, #10] +; VBITS_GE_256-NEXT: add x8, sp, #8 +; VBITS_GE_256-NEXT: strb w10, [sp, #11] +; VBITS_GE_256-NEXT: strb w9, [sp, #9] +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x8] +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_256-NEXT: add sp, sp, #16 +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: sdiv_v8i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: sub sp, sp, #16 +; VBITS_GE_512-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: ptrue p0.b, vl8 +; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_512-NEXT: fmov w8, s0 +; VBITS_GE_512-NEXT: mov z1.h, z0.h[7] +; VBITS_GE_512-NEXT: mov z3.h, z0.h[5] +; VBITS_GE_512-NEXT: fmov w9, s1 +; VBITS_GE_512-NEXT: mov z2.h, z0.h[6] +; VBITS_GE_512-NEXT: mov z4.h, z0.h[4] +; VBITS_GE_512-NEXT: strb w8, [sp, #8] +; VBITS_GE_512-NEXT: fmov w8, s3 +; VBITS_GE_512-NEXT: mov z6.h, z0.h[2] +; VBITS_GE_512-NEXT: fmov w10, s2 +; VBITS_GE_512-NEXT: strb w9, [sp, #15] +; VBITS_GE_512-NEXT: fmov w9, s4 +; VBITS_GE_512-NEXT: strb w8, [sp, #13] +; VBITS_GE_512-NEXT: fmov w8, s6 +; VBITS_GE_512-NEXT: mov z5.h, z0.h[3] +; VBITS_GE_512-NEXT: mov z0.h, z0.h[1] +; VBITS_GE_512-NEXT: strb w10, [sp, #14] +; VBITS_GE_512-NEXT: fmov w10, s5 +; VBITS_GE_512-NEXT: strb w9, [sp, #12] +; VBITS_GE_512-NEXT: fmov w9, s0 +; VBITS_GE_512-NEXT: strb w8, [sp, #10] +; VBITS_GE_512-NEXT: add x8, sp, #8 +; VBITS_GE_512-NEXT: strb w10, [sp, #11] +; VBITS_GE_512-NEXT: strb w9, [sp, #9] +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x8] +; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_512-NEXT: add sp, sp, #16 +; VBITS_GE_512-NEXT: ret + %res = sdiv <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; VBITS_GE_128-LABEL: sdiv_v16i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_128-NEXT: sunpkhi z2.h, z1.b +; VBITS_GE_128-NEXT: sunpkhi z3.h, z0.b +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_128-NEXT: sunpkhi z4.s, z2.h +; VBITS_GE_128-NEXT: sunpkhi z5.s, z3.h +; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: sunpklo z0.h, 
z0.b +; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_128-NEXT: sunpkhi z3.s, z1.h +; VBITS_GE_128-NEXT: sunpkhi z5.s, z0.h +; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: sdivr z3.s, p0/m, z3.s, z5.s +; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_128-NEXT: uzp1 z1.h, z2.h, z4.h +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z3.h +; VBITS_GE_128-NEXT: uzp1 z0.b, z0.b, z1.b +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: sdiv_v16i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_256-NEXT: sunpkhi z2.s, z1.h +; VBITS_GE_256-NEXT: sunpkhi z3.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: sdiv_v16i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_512-NEXT: ret + %res = sdiv <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: sdiv_v32i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: sunpkhi z5.h, z0.b +; VBITS_GE_128-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_128-NEXT: sunpkhi z4.h, z2.b +; VBITS_GE_128-NEXT: sunpklo z2.h, z2.b +; VBITS_GE_128-NEXT: sunpkhi z6.s, z4.h +; VBITS_GE_128-NEXT: sunpkhi z7.s, z5.h +; VBITS_GE_128-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_128-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_128-NEXT: sunpkhi z16.s, z2.h +; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: sunpkhi z5.s, z0.h +; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: uzp1 z4.h, z4.h, z6.h +; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; VBITS_GE_128-NEXT: sunpkhi z2.h, z3.b +; VBITS_GE_128-NEXT: sunpkhi z6.h, z1.b +; VBITS_GE_128-NEXT: sdiv z5.s, p0/m, z5.s, z16.s +; VBITS_GE_128-NEXT: sunpkhi z7.s, z2.h +; VBITS_GE_128-NEXT: sunpkhi z16.s, z6.h +; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: sunpklo z6.s, z6.h +; VBITS_GE_128-NEXT: sunpklo z3.h, z3.b +; VBITS_GE_128-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_128-NEXT: sdivr z7.s, p0/m, z7.s, 
z16.s +; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z6.s +; VBITS_GE_128-NEXT: sunpkhi z6.s, z3.h +; VBITS_GE_128-NEXT: sunpkhi z16.s, z1.h +; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z16.s +; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z7.h +; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z6.h +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z5.h +; VBITS_GE_128-NEXT: uzp1 z1.b, z1.b, z2.b +; VBITS_GE_128-NEXT: uzp1 z0.b, z0.b, z4.b +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: sdiv_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: sunpkhi z2.h, z1.b +; VBITS_GE_256-NEXT: sunpkhi z3.h, z0.b +; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_256-NEXT: sunpkhi z4.s, z2.h +; VBITS_GE_256-NEXT: sunpkhi z5.s, z3.h +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: sdivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_256-NEXT: sunpkhi z5.s, z1.h +; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s +; VBITS_GE_256-NEXT: sunpkhi z3.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sdiv z3.s, p1/m, z3.s, z5.s +; VBITS_GE_256-NEXT: sdiv z0.s, p1/m, z0.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z1.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = sdiv <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @sdiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) vscale_range(16,0) #0 { +; CHECK-LABEL: sdiv_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl64 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: st1b { z0.s }, p0, [x0] +; CHECK-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = sdiv <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: sdiv_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; 
CHECK-NEXT: ret + %res = sdiv <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; VBITS_GE_128-LABEL: sdiv_v8i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: sunpkhi z2.s, z1.h +; VBITS_GE_128-NEXT: sunpkhi z3.s, z0.h +; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: sdiv_v8i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: sdiv_v8i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_512-NEXT: ret + %res = sdiv <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: sdiv_v16i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: sunpkhi z5.s, z0.h +; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: sunpkhi z4.s, z2.h +; VBITS_GE_128-NEXT: sunpkhi z6.s, z3.h +; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: sunpkhi z5.s, z1.h +; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: sdiv z5.s, p0/m, z5.s, z6.s +; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z5.h +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z4.h +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: sdiv_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: sunpkhi z2.s, z1.h +; VBITS_GE_256-NEXT: sunpkhi z3.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s +; VBITS_GE_256-NEXT: sdiv z0.s, p1/m, z0.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; 
VBITS_GE_512-LABEL: sdiv_v16i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl16 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = sdiv <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @sdiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: sdiv_v32i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #16 +; VBITS_GE_128-NEXT: mov x9, #24 +; VBITS_GE_128-NEXT: mov x10, #8 +; VBITS_GE_128-NEXT: ptrue p1.h, vl8 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p1/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p1/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p1/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p1/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p1/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p1/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z17.h }, p1/z, [x1] +; VBITS_GE_128-NEXT: sunpkhi z18.s, z1.h +; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: sunpkhi z16.s, z2.h +; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: sunpkhi z7.s, z4.h +; VBITS_GE_128-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_128-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; VBITS_GE_128-NEXT: sunpkhi z16.s, z5.h +; VBITS_GE_128-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_128-NEXT: sdiv z2.s, p0/m, z2.s, z4.s +; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z5.s +; VBITS_GE_128-NEXT: sunpkhi z4.s, z6.h +; VBITS_GE_128-NEXT: sunpkhi z5.s, z0.h +; VBITS_GE_128-NEXT: sunpklo z6.s, z6.h +; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z6.s +; VBITS_GE_128-NEXT: sunpkhi z5.s, z17.h +; VBITS_GE_128-NEXT: sdivr z16.s, p0/m, z16.s, z18.s +; VBITS_GE_128-NEXT: sunpkhi z6.s, z3.h +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z4.h +; VBITS_GE_128-NEXT: movprfx z4, z6 +; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: sunpklo z5.s, z17.h +; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z16.h +; VBITS_GE_128-NEXT: sdiv z3.s, p0/m, z3.s, z5.s +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: uzp1 z0.h, z3.h, z4.h +; VBITS_GE_128-NEXT: uzp1 z1.h, z2.h, z7.h +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: sdiv_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: sunpkhi z5.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpkhi z4.s, z2.h +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: sdivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_256-NEXT: sunpkhi z5.s, z3.h +; VBITS_GE_256-NEXT: sdiv z0.s, p1/m, z0.s, z2.s +; VBITS_GE_256-NEXT: sunpkhi z2.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; 
VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sdiv z2.s, p1/m, z2.s, z5.s +; VBITS_GE_256-NEXT: sdiv z1.s, p1/m, z1.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z2.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = sdiv <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: sdiv_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: sdiv_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @sdiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: sdiv_v8i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: sdiv_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: sdiv_v8i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = sdiv <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @sdiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: sdiv_v16i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: mov x9, #12 +; VBITS_GE_128-NEXT: mov x10, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z4.s +; 
VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z5.s +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: movprfx z0, z3 +; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z7.s +; VBITS_GE_128-NEXT: movprfx z1, z2 +; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z6.s +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: sdiv_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; VBITS_GE_256-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: sdiv_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = sdiv <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: sdiv_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: sdiv_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @sdiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: sdiv_v4i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: sdiv z1.d, p0/m, z1.d, z3.d +; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: sdiv_v4i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: sdiv z0.d, p0/m, z0.d, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: sdiv_v4i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl4 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: sdiv z0.d, p0/m, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b 
+ %res = sdiv <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @sdiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: sdiv_v8i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: mov x9, #6 +; VBITS_GE_128-NEXT: mov x10, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z4.d +; VBITS_GE_128-NEXT: sdiv z1.d, p0/m, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: movprfx z0, z3 +; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z7.d +; VBITS_GE_128-NEXT: movprfx z1, z2 +; VBITS_GE_128-NEXT: sdiv z1.d, p0/m, z1.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: sdiv_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: sdiv z0.d, p0/m, z0.d, z2.d +; VBITS_GE_256-NEXT: sdiv z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: sdiv_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: sdiv z0.d, p0/m, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = sdiv <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +; +; UDIV +; + +define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; VBITS_GE_128-LABEL: udiv_v8i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_128-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_128-NEXT: uunpklo z0.h, z0.b +; VBITS_GE_128-NEXT: uunpkhi z2.s, z1.h +; VBITS_GE_128-NEXT: uunpkhi z3.s, z0.h +; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h +; VBITS_GE_128-NEXT: ptrue p0.b, vl8 +; VBITS_GE_128-NEXT: fmov w8, s0 +; VBITS_GE_128-NEXT: mov z1.h, z0.h[7] +; VBITS_GE_128-NEXT: mov z3.h, z0.h[5] +; VBITS_GE_128-NEXT: fmov w9, s1 +; VBITS_GE_128-NEXT: mov z2.h, z0.h[6] +; VBITS_GE_128-NEXT: mov z4.h, z0.h[4] +; VBITS_GE_128-NEXT: strb w8, [sp, #8] +; VBITS_GE_128-NEXT: fmov w8, s3 +; VBITS_GE_128-NEXT: mov z6.h, z0.h[2] +; VBITS_GE_128-NEXT: fmov w10, s2 +; VBITS_GE_128-NEXT: strb w9, [sp, #15] +; VBITS_GE_128-NEXT: fmov w9, s4 +; VBITS_GE_128-NEXT: strb w8, [sp, 
#13] +; VBITS_GE_128-NEXT: fmov w8, s6 +; VBITS_GE_128-NEXT: mov z5.h, z0.h[3] +; VBITS_GE_128-NEXT: mov z0.h, z0.h[1] +; VBITS_GE_128-NEXT: strb w10, [sp, #14] +; VBITS_GE_128-NEXT: fmov w10, s5 +; VBITS_GE_128-NEXT: strb w9, [sp, #12] +; VBITS_GE_128-NEXT: fmov w9, s0 +; VBITS_GE_128-NEXT: strb w8, [sp, #10] +; VBITS_GE_128-NEXT: add x8, sp, #8 +; VBITS_GE_128-NEXT: strb w10, [sp, #11] +; VBITS_GE_128-NEXT: strb w9, [sp, #9] +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x8] +; VBITS_GE_128-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: udiv_v8i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: sub sp, sp, #16 +; VBITS_GE_256-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_256-NEXT: ptrue p0.b, vl8 +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: fmov w8, s0 +; VBITS_GE_256-NEXT: mov z1.h, z0.h[7] +; VBITS_GE_256-NEXT: mov z3.h, z0.h[5] +; VBITS_GE_256-NEXT: fmov w9, s1 +; VBITS_GE_256-NEXT: mov z2.h, z0.h[6] +; VBITS_GE_256-NEXT: mov z4.h, z0.h[4] +; VBITS_GE_256-NEXT: strb w8, [sp, #8] +; VBITS_GE_256-NEXT: fmov w8, s3 +; VBITS_GE_256-NEXT: mov z6.h, z0.h[2] +; VBITS_GE_256-NEXT: fmov w10, s2 +; VBITS_GE_256-NEXT: strb w9, [sp, #15] +; VBITS_GE_256-NEXT: fmov w9, s4 +; VBITS_GE_256-NEXT: strb w8, [sp, #13] +; VBITS_GE_256-NEXT: fmov w8, s6 +; VBITS_GE_256-NEXT: mov z5.h, z0.h[3] +; VBITS_GE_256-NEXT: mov z0.h, z0.h[1] +; VBITS_GE_256-NEXT: strb w10, [sp, #14] +; VBITS_GE_256-NEXT: fmov w10, s5 +; VBITS_GE_256-NEXT: strb w9, [sp, #12] +; VBITS_GE_256-NEXT: fmov w9, s0 +; VBITS_GE_256-NEXT: strb w8, [sp, #10] +; VBITS_GE_256-NEXT: add x8, sp, #8 +; VBITS_GE_256-NEXT: strb w10, [sp, #11] +; VBITS_GE_256-NEXT: strb w9, [sp, #9] +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x8] +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_256-NEXT: add sp, sp, #16 +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: udiv_v8i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: sub sp, sp, #16 +; VBITS_GE_512-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b +; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: ptrue p0.b, vl8 +; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_512-NEXT: fmov w8, s0 +; VBITS_GE_512-NEXT: mov z1.h, z0.h[7] +; VBITS_GE_512-NEXT: mov z3.h, z0.h[5] +; VBITS_GE_512-NEXT: fmov w9, s1 +; VBITS_GE_512-NEXT: mov z2.h, z0.h[6] +; VBITS_GE_512-NEXT: mov z4.h, z0.h[4] +; VBITS_GE_512-NEXT: strb w8, [sp, #8] +; VBITS_GE_512-NEXT: fmov w8, s3 +; VBITS_GE_512-NEXT: mov z6.h, z0.h[2] +; VBITS_GE_512-NEXT: fmov w10, s2 +; VBITS_GE_512-NEXT: strb w9, [sp, #15] +; VBITS_GE_512-NEXT: fmov w9, s4 +; VBITS_GE_512-NEXT: strb w8, [sp, #13] +; VBITS_GE_512-NEXT: fmov w8, s6 +; VBITS_GE_512-NEXT: mov z5.h, z0.h[3] +; VBITS_GE_512-NEXT: mov z0.h, z0.h[1] +; VBITS_GE_512-NEXT: strb w10, [sp, #14] +; 
VBITS_GE_512-NEXT: fmov w10, s5 +; VBITS_GE_512-NEXT: strb w9, [sp, #12] +; VBITS_GE_512-NEXT: fmov w9, s0 +; VBITS_GE_512-NEXT: strb w8, [sp, #10] +; VBITS_GE_512-NEXT: add x8, sp, #8 +; VBITS_GE_512-NEXT: strb w10, [sp, #11] +; VBITS_GE_512-NEXT: strb w9, [sp, #9] +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x8] +; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_512-NEXT: add sp, sp, #16 +; VBITS_GE_512-NEXT: ret + %res = udiv <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; VBITS_GE_128-LABEL: udiv_v16i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_128-NEXT: uunpkhi z2.h, z1.b +; VBITS_GE_128-NEXT: uunpkhi z3.h, z0.b +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_128-NEXT: uunpkhi z4.s, z2.h +; VBITS_GE_128-NEXT: uunpkhi z5.s, z3.h +; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: uunpklo z0.h, z0.b +; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_128-NEXT: uunpkhi z3.s, z1.h +; VBITS_GE_128-NEXT: uunpkhi z5.s, z0.h +; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: udivr z3.s, p0/m, z3.s, z5.s +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_128-NEXT: uzp1 z1.h, z2.h, z4.h +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z3.h +; VBITS_GE_128-NEXT: uzp1 z0.b, z0.b, z1.b +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: udiv_v16i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b +; VBITS_GE_256-NEXT: uunpkhi z2.s, z1.h +; VBITS_GE_256-NEXT: uunpkhi z3.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: udiv_v16i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b +; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_512-NEXT: ret + %res = udiv <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: udiv_v32i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; 
VBITS_GE_128-NEXT: uunpkhi z5.h, z0.b +; VBITS_GE_128-NEXT: uunpklo z0.h, z0.b +; VBITS_GE_128-NEXT: uunpkhi z4.h, z2.b +; VBITS_GE_128-NEXT: uunpklo z2.h, z2.b +; VBITS_GE_128-NEXT: uunpkhi z6.s, z4.h +; VBITS_GE_128-NEXT: uunpkhi z7.s, z5.h +; VBITS_GE_128-NEXT: uunpklo z4.s, z4.h +; VBITS_GE_128-NEXT: uunpklo z5.s, z5.h +; VBITS_GE_128-NEXT: uunpkhi z16.s, z2.h +; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: uunpkhi z5.s, z0.h +; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: uzp1 z4.h, z4.h, z6.h +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; VBITS_GE_128-NEXT: uunpkhi z2.h, z3.b +; VBITS_GE_128-NEXT: uunpkhi z6.h, z1.b +; VBITS_GE_128-NEXT: udiv z5.s, p0/m, z5.s, z16.s +; VBITS_GE_128-NEXT: uunpkhi z7.s, z2.h +; VBITS_GE_128-NEXT: uunpkhi z16.s, z6.h +; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: uunpklo z6.s, z6.h +; VBITS_GE_128-NEXT: uunpklo z3.h, z3.b +; VBITS_GE_128-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_128-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z6.s +; VBITS_GE_128-NEXT: uunpkhi z6.s, z3.h +; VBITS_GE_128-NEXT: uunpkhi z16.s, z1.h +; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z16.s +; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z7.h +; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z6.h +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z5.h +; VBITS_GE_128-NEXT: uzp1 z1.b, z1.b, z2.b +; VBITS_GE_128-NEXT: uzp1 z0.b, z0.b, z4.b +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: udiv_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: uunpkhi z2.h, z1.b +; VBITS_GE_256-NEXT: uunpkhi z3.h, z0.b +; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b +; VBITS_GE_256-NEXT: uunpkhi z4.s, z2.h +; VBITS_GE_256-NEXT: uunpkhi z5.s, z3.h +; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: udivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_256-NEXT: uunpkhi z5.s, z1.h +; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s +; VBITS_GE_256-NEXT: uunpkhi z3.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: udiv z3.s, p1/m, z3.s, z5.s +; VBITS_GE_256-NEXT: udiv z0.s, p1/m, z0.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z1.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = udiv <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @udiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) vscale_range(16,0) #0 { +; CHECK-LABEL: udiv_v64i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1] +; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0] +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: st1b { z0.s }, p0, [x0] +; CHECK-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = udiv <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <4 x i16> 
@udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: udiv_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = udiv <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; VBITS_GE_128-LABEL: udiv_v8i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: uunpkhi z2.s, z1.h +; VBITS_GE_128-NEXT: uunpkhi z3.s, z0.h +; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: udiv_v8i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: udiv_v8i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_512-NEXT: ret + %res = udiv <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: udiv_v16i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: uunpkhi z5.s, z0.h +; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: uunpkhi z4.s, z2.h +; VBITS_GE_128-NEXT: uunpkhi z6.s, z3.h +; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: uunpkhi z5.s, z1.h +; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h 
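+; Descriptive note (not an autogenerated check): at this point each i16 vector
+; has been zero-extended into two .s halves via uunpklo/uunpkhi; the udivs
+; below operate on the widened halves, since SVE only provides integer
+; division for 32- and 64-bit elements, and uzp1 re-narrows the results to .h
+; before they are stored.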
+; VBITS_GE_128-NEXT: udiv z5.s, p0/m, z5.s, z6.s +; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z5.h +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z4.h +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: udiv_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: uunpkhi z2.s, z1.h +; VBITS_GE_256-NEXT: uunpkhi z3.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s +; VBITS_GE_256-NEXT: udiv z0.s, p1/m, z0.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: udiv_v16i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1h { z1.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: udivr z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = udiv <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @udiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: udiv_v32i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #16 +; VBITS_GE_128-NEXT: mov x9, #24 +; VBITS_GE_128-NEXT: mov x10, #8 +; VBITS_GE_128-NEXT: ptrue p1.h, vl8 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p1/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p1/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p1/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p1/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p1/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p1/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z17.h }, p1/z, [x1] +; VBITS_GE_128-NEXT: uunpkhi z18.s, z1.h +; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: uunpkhi z16.s, z2.h +; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: uunpkhi z7.s, z4.h +; VBITS_GE_128-NEXT: uunpklo z4.s, z4.h +; VBITS_GE_128-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; VBITS_GE_128-NEXT: uunpkhi z16.s, z5.h +; VBITS_GE_128-NEXT: uunpklo z5.s, z5.h +; VBITS_GE_128-NEXT: udiv z2.s, p0/m, z2.s, z4.s +; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z5.s +; VBITS_GE_128-NEXT: uunpkhi z4.s, z6.h +; VBITS_GE_128-NEXT: uunpkhi z5.s, z0.h +; VBITS_GE_128-NEXT: uunpklo z6.s, z6.h +; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z6.s +; VBITS_GE_128-NEXT: uunpkhi z5.s, z17.h +; VBITS_GE_128-NEXT: udivr z16.s, p0/m, z16.s, z18.s +; VBITS_GE_128-NEXT: uunpkhi z6.s, z3.h +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z4.h +; VBITS_GE_128-NEXT: movprfx z4, z6 +; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: uunpklo z5.s, z17.h +; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z16.h +; VBITS_GE_128-NEXT: udiv z3.s, p0/m, z3.s, z5.s +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: uzp1 z0.h, z3.h, z4.h +; VBITS_GE_128-NEXT: uzp1 z1.h, z2.h, z7.h +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; 
VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: udiv_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: uunpkhi z5.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: uunpkhi z4.s, z2.h +; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: udivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_256-NEXT: uunpkhi z5.s, z3.h +; VBITS_GE_256-NEXT: udiv z0.s, p1/m, z0.s, z2.s +; VBITS_GE_256-NEXT: uunpkhi z2.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: udiv z2.s, p1/m, z2.s, z5.s +; VBITS_GE_256-NEXT: udiv z1.s, p1/m, z1.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z2.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = udiv <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: udiv_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: udiv_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: udiv_v8i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: udiv_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: udiv_v8i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = udiv <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a 
+ ret void +} + +define void @udiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: udiv_v16i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: mov x9, #12 +; VBITS_GE_128-NEXT: mov x10, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z4.s +; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z5.s +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: movprfx z0, z3 +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z7.s +; VBITS_GE_128-NEXT: movprfx z1, z2 +; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z6.s +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: udiv_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; VBITS_GE_256-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: udiv_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = udiv <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: udiv_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: udiv_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @udiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: udiv_v4i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: udiv z1.d, p0/m, z1.d, z3.d +; VBITS_GE_128-NEXT: udiv z0.d, p0/m, 
z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: udiv_v4i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: udiv z0.d, p0/m, z0.d, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: udiv_v4i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl4 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: udiv z0.d, p0/m, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = udiv <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @udiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: udiv_v8i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: mov x9, #6 +; VBITS_GE_128-NEXT: mov x10, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z4.d +; VBITS_GE_128-NEXT: udiv z1.d, p0/m, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: movprfx z0, z3 +; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z7.d +; VBITS_GE_128-NEXT: movprfx z1, z2 +; VBITS_GE_128-NEXT: udiv z1.d, p0/m, z1.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: udiv_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: udiv z0.d, p0/m, z0.d, z2.d +; VBITS_GE_256-NEXT: udiv z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: udiv_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: udiv z0.d, p0/m, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = udiv <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +define void @udiv_constantsplat_v8i32(<8 x i32>* %a) #0 { +; VBITS_GE_128-LABEL: udiv_constantsplat_v8i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: adrp x8, .LCPI32_0 +; VBITS_GE_128-NEXT: add x8, x8, :lo12:.LCPI32_0 +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x8] +; VBITS_GE_128-NEXT: adrp x8, .LCPI32_1 +; 
VBITS_GE_128-NEXT: add x8, x8, :lo12:.LCPI32_1 +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x8] +; VBITS_GE_128-NEXT: adrp x8, .LCPI32_2 +; VBITS_GE_128-NEXT: add x8, x8, :lo12:.LCPI32_2 +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x8] +; VBITS_GE_128-NEXT: movprfx z5, z1 +; VBITS_GE_128-NEXT: umulh z5.s, p0/m, z5.s, z2.s +; VBITS_GE_128-NEXT: umulh z2.s, p0/m, z2.s, z0.s +; VBITS_GE_128-NEXT: sub z1.s, z1.s, z5.s +; VBITS_GE_128-NEXT: sub z0.s, z0.s, z2.s +; VBITS_GE_128-NEXT: lsr z1.s, p0/m, z1.s, z3.s +; VBITS_GE_128-NEXT: lsr z0.s, p0/m, z0.s, z3.s +; VBITS_GE_128-NEXT: add z1.s, z1.s, z5.s +; VBITS_GE_128-NEXT: add z0.s, z0.s, z2.s +; VBITS_GE_128-NEXT: lsr z1.s, p0/m, z1.s, z4.s +; VBITS_GE_128-NEXT: lsr z0.s, p0/m, z0.s, z4.s +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: udiv_constantsplat_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov w8, #8969 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: movk w8, #22765, lsl #16 +; VBITS_GE_256-NEXT: mov z1.s, w8 +; VBITS_GE_256-NEXT: umulh z1.s, p0/m, z1.s, z0.s +; VBITS_GE_256-NEXT: sub z0.s, z0.s, z1.s +; VBITS_GE_256-NEXT: lsr z0.s, p0/m, z0.s, #1 +; VBITS_GE_256-NEXT: add z0.s, z0.s, z1.s +; VBITS_GE_256-NEXT: lsr z0.s, p0/m, z0.s, #6 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: udiv_constantsplat_v8i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: mov w8, #8969 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: movk w8, #22765, lsl #16 +; VBITS_GE_512-NEXT: mov z1.s, w8 +; VBITS_GE_512-NEXT: umulh z1.s, p0/m, z1.s, z0.s +; VBITS_GE_512-NEXT: sub z0.s, z0.s, z1.s +; VBITS_GE_512-NEXT: lsr z0.s, p0/m, z0.s, #1 +; VBITS_GE_512-NEXT: add z0.s, z0.s, z1.s +; VBITS_GE_512-NEXT: lsr z0.s, p0/m, z0.s, #6 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95> + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll @@ -0,0 +1,3109 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -aarch64-sve-vector-bits-min=128 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128 +; RUN: llc -aarch64-sve-vector-bits-min=256 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=512 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=2048 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 + +target triple = "aarch64-unknown-linux-gnu" + +; +; AND +; + +define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: and_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> 
%op2) #0 { +; CHECK-LABEL: and_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @and_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: and_v32i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: and z1.d, z1.d, z3.d +; VBITS_GE_128-NEXT: and z0.d, z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: and_v32i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl32 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = and <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @and_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: and_v64i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #32 +; VBITS_GE_128-NEXT: mov w9, #48 +; VBITS_GE_128-NEXT: mov w10, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: ld1b { z7.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: and z0.d, z0.d, z4.d +; VBITS_GE_128-NEXT: and z1.d, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: and z0.d, z3.d, z7.d +; VBITS_GE_128-NEXT: and z1.d, z2.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z0.d, z0.d, z2.d +; VBITS_GE_256-NEXT: and z1.d, z1.d, z3.d +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: and_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* 
%b + %res = and <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define void @and_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: and_v128i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #96 +; VBITS_GE_128-NEXT: mov w9, #112 +; VBITS_GE_128-NEXT: mov w10, #64 +; VBITS_GE_128-NEXT: mov w11, #80 +; VBITS_GE_128-NEXT: mov w12, #32 +; VBITS_GE_128-NEXT: mov w13, #48 +; VBITS_GE_128-NEXT: mov w14, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0, x11] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x0, x12] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x0, x13] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x0, x14] +; VBITS_GE_128-NEXT: ld1b { z7.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z16.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z17.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: ld1b { z18.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: ld1b { z19.b }, p0/z, [x1, x11] +; VBITS_GE_128-NEXT: ld1b { z20.b }, p0/z, [x1, x12] +; VBITS_GE_128-NEXT: ld1b { z21.b }, p0/z, [x1, x13] +; VBITS_GE_128-NEXT: ld1b { z22.b }, p0/z, [x1, x14] +; VBITS_GE_128-NEXT: ld1b { z23.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: and z0.d, z0.d, z16.d +; VBITS_GE_128-NEXT: and z1.d, z1.d, z17.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_GE_128-NEXT: and z0.d, z2.d, z18.d +; VBITS_GE_128-NEXT: and z1.d, z3.d, z19.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #64] +; VBITS_GE_128-NEXT: and z0.d, z4.d, z20.d +; VBITS_GE_128-NEXT: and z1.d, z5.d, z21.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: and z0.d, z7.d, z23.d +; VBITS_GE_128-NEXT: and z1.d, z6.d, z22.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v128i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #64 +; VBITS_GE_256-NEXT: mov w9, #96 +; VBITS_GE_256-NEXT: mov w10, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z2.d, z2.d, z5.d +; VBITS_GE_256-NEXT: and z1.d, z1.d, z4.d +; VBITS_GE_256-NEXT: and z0.d, z0.d, z6.d +; VBITS_GE_256-NEXT: and z3.d, z3.d, z7.d +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <128 x i8>, <128 x i8>* %a + %op2 = load <128 x i8>, <128 x i8>* %b + %res = and <128 x i8> %op1, %op2 + store <128 x i8> %res, <128 x i8>* %a + ret void +} + +define void @and_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: and_v256i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: stp d15, d14, [sp, #-64]! 
// 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 64 +; VBITS_GE_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_offset b8, -8 +; VBITS_GE_128-NEXT: .cfi_offset b9, -16 +; VBITS_GE_128-NEXT: .cfi_offset b10, -24 +; VBITS_GE_128-NEXT: .cfi_offset b11, -32 +; VBITS_GE_128-NEXT: .cfi_offset b12, -40 +; VBITS_GE_128-NEXT: .cfi_offset b13, -48 +; VBITS_GE_128-NEXT: .cfi_offset b14, -56 +; VBITS_GE_128-NEXT: .cfi_offset b15, -64 +; VBITS_GE_128-NEXT: mov w8, #240 +; VBITS_GE_128-NEXT: mov w9, #224 +; VBITS_GE_128-NEXT: mov w10, #208 +; VBITS_GE_128-NEXT: mov w11, #192 +; VBITS_GE_128-NEXT: mov w12, #176 +; VBITS_GE_128-NEXT: mov w13, #160 +; VBITS_GE_128-NEXT: mov w14, #144 +; VBITS_GE_128-NEXT: mov w15, #128 +; VBITS_GE_128-NEXT: mov w16, #112 +; VBITS_GE_128-NEXT: mov w17, #96 +; VBITS_GE_128-NEXT: mov w18, #80 +; VBITS_GE_128-NEXT: mov w2, #64 +; VBITS_GE_128-NEXT: mov w3, #48 +; VBITS_GE_128-NEXT: mov w4, #32 +; VBITS_GE_128-NEXT: mov w5, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z16.b }, p0/z, [x0, x11] +; VBITS_GE_128-NEXT: ld1b { z7.b }, p0/z, [x0, x12] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x0, x13] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x0, x14] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0, x15] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x0, x16] +; VBITS_GE_128-NEXT: ld1b { z17.b }, p0/z, [x0, x17] +; VBITS_GE_128-NEXT: ld1b { z18.b }, p0/z, [x0, x18] +; VBITS_GE_128-NEXT: ld1b { z19.b }, p0/z, [x0, x2] +; VBITS_GE_128-NEXT: ld1b { z20.b }, p0/z, [x0, x3] +; VBITS_GE_128-NEXT: ld1b { z21.b }, p0/z, [x0, x4] +; VBITS_GE_128-NEXT: ld1b { z22.b }, p0/z, [x0, x5] +; VBITS_GE_128-NEXT: ld1b { z23.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z24.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z25.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: ld1b { z26.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: ld1b { z27.b }, p0/z, [x1, x11] +; VBITS_GE_128-NEXT: ld1b { z28.b }, p0/z, [x1, x12] +; VBITS_GE_128-NEXT: ld1b { z29.b }, p0/z, [x1, x13] +; VBITS_GE_128-NEXT: ld1b { z30.b }, p0/z, [x1, x14] +; VBITS_GE_128-NEXT: ld1b { z31.b }, p0/z, [x1, x15] +; VBITS_GE_128-NEXT: ld1b { z8.b }, p0/z, [x1, x16] +; VBITS_GE_128-NEXT: ld1b { z9.b }, p0/z, [x1, x17] +; VBITS_GE_128-NEXT: and z1.d, z1.d, z24.d +; VBITS_GE_128-NEXT: and z2.d, z2.d, z25.d +; VBITS_GE_128-NEXT: ld1b { z10.b }, p0/z, [x1, x18] +; VBITS_GE_128-NEXT: ld1b { z11.b }, p0/z, [x1, x2] +; VBITS_GE_128-NEXT: ld1b { z12.b }, p0/z, [x1, x3] +; VBITS_GE_128-NEXT: ld1b { z13.b }, p0/z, [x1, x4] +; VBITS_GE_128-NEXT: ld1b { z14.b }, p0/z, [x1, x5] +; VBITS_GE_128-NEXT: ld1b { z15.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: stp q2, q1, [x0, #224] +; VBITS_GE_128-NEXT: and z0.d, z0.d, z26.d +; VBITS_GE_128-NEXT: and z1.d, z16.d, z27.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #192] +; VBITS_GE_128-NEXT: and z0.d, z7.d, z28.d +; VBITS_GE_128-NEXT: and z1.d, z6.d, z29.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #160] +; VBITS_GE_128-NEXT: and z0.d, z5.d, z30.d +; VBITS_GE_128-NEXT: and z1.d, z3.d, z31.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #128] +; VBITS_GE_128-NEXT: and z0.d, z4.d, z8.d +; VBITS_GE_128-NEXT: and z1.d, z17.d, z9.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #96] +; 
VBITS_GE_128-NEXT: and z0.d, z18.d, z10.d +; VBITS_GE_128-NEXT: and z1.d, z19.d, z11.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #64] +; VBITS_GE_128-NEXT: and z0.d, z20.d, z12.d +; VBITS_GE_128-NEXT: and z1.d, z21.d, z13.d +; VBITS_GE_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: and z0.d, z23.d, z15.d +; VBITS_GE_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: and z1.d, z22.d, z14.d +; VBITS_GE_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v256i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #192 +; VBITS_GE_256-NEXT: mov w9, #224 +; VBITS_GE_256-NEXT: mov w10, #128 +; VBITS_GE_256-NEXT: mov w11, #160 +; VBITS_GE_256-NEXT: mov w12, #64 +; VBITS_GE_256-NEXT: mov w13, #96 +; VBITS_GE_256-NEXT: mov w14, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x11] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x0, x12] +; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x0, x13] +; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x0, x14] +; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z16.b }, p0/z, [x1, x13] +; VBITS_GE_256-NEXT: ld1b { z17.b }, p0/z, [x1, x14] +; VBITS_GE_256-NEXT: ld1b { z18.b }, p0/z, [x1, x11] +; VBITS_GE_256-NEXT: ld1b { z19.b }, p0/z, [x1, x12] +; VBITS_GE_256-NEXT: ld1b { z20.b }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1b { z21.b }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1b { z22.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z23.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z6.d, z6.d, z17.d +; VBITS_GE_256-NEXT: and z5.d, z5.d, z16.d +; VBITS_GE_256-NEXT: and z4.d, z4.d, z19.d +; VBITS_GE_256-NEXT: and z3.d, z3.d, z18.d +; VBITS_GE_256-NEXT: and z2.d, z2.d, z21.d +; VBITS_GE_256-NEXT: and z1.d, z1.d, z20.d +; VBITS_GE_256-NEXT: and z0.d, z0.d, z22.d +; VBITS_GE_256-NEXT: and z7.d, z7.d, z23.d +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x11] +; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0, x12] +; VBITS_GE_256-NEXT: st1b { z5.b }, p0, [x0, x13] +; VBITS_GE_256-NEXT: st1b { z6.b }, p0, [x0, x14] +; VBITS_GE_256-NEXT: st1b { z7.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <256 x i8>, <256 x i8>* %a + %op2 = load <256 x i8>, <256 x i8>* %b + %res = and <256 x i8> %op1, %op2 + store <256 x i8> %res, <256 x i8>* %a + ret void +} + +define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: and_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: and_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret 
+ %res = and <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @and_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: and_v16i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: and z1.d, z1.d, z3.d +; VBITS_GE_128-NEXT: and z0.d, z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: and_v16i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl16 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = and <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @and_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: and_v32i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #16 +; VBITS_GE_128-NEXT: mov x9, #24 +; VBITS_GE_128-NEXT: mov x10, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z7.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: and z0.d, z0.d, z4.d +; VBITS_GE_128-NEXT: and z1.d, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: and z0.d, z3.d, z7.d +; VBITS_GE_128-NEXT: and z1.d, z2.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z0.d, z0.d, z2.d +; VBITS_GE_256-NEXT: and z1.d, z1.d, z3.d +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: and_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = and <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define void @and_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { +; 
VBITS_GE_128-LABEL: and_v64i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #48 +; VBITS_GE_128-NEXT: mov x9, #56 +; VBITS_GE_128-NEXT: mov x10, #32 +; VBITS_GE_128-NEXT: mov x11, #40 +; VBITS_GE_128-NEXT: mov x12, #16 +; VBITS_GE_128-NEXT: mov x13, #24 +; VBITS_GE_128-NEXT: mov x14, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z16.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z17.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z18.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z19.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z20.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z21.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z22.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z23.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: and z0.d, z0.d, z16.d +; VBITS_GE_128-NEXT: and z1.d, z1.d, z17.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_GE_128-NEXT: and z0.d, z2.d, z18.d +; VBITS_GE_128-NEXT: and z1.d, z3.d, z19.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #64] +; VBITS_GE_128-NEXT: and z0.d, z4.d, z20.d +; VBITS_GE_128-NEXT: and z1.d, z5.d, z21.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: and z0.d, z7.d, z23.d +; VBITS_GE_128-NEXT: and z1.d, z6.d, z22.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z2.d, z2.d, z5.d +; VBITS_GE_256-NEXT: and z1.d, z1.d, z4.d +; VBITS_GE_256-NEXT: and z0.d, z0.d, z6.d +; VBITS_GE_256-NEXT: and z3.d, z3.d, z7.d +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <64 x i16>, <64 x i16>* %a + %op2 = load <64 x i16>, <64 x i16>* %b + %res = and <64 x i16> %op1, %op2 + store <64 x i16> %res, <64 x i16>* %a + ret void +} + +define void @and_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: and_v128i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: stp d15, d14, [sp, #-64]! 
// 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 64 +; VBITS_GE_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_offset b8, -8 +; VBITS_GE_128-NEXT: .cfi_offset b9, -16 +; VBITS_GE_128-NEXT: .cfi_offset b10, -24 +; VBITS_GE_128-NEXT: .cfi_offset b11, -32 +; VBITS_GE_128-NEXT: .cfi_offset b12, -40 +; VBITS_GE_128-NEXT: .cfi_offset b13, -48 +; VBITS_GE_128-NEXT: .cfi_offset b14, -56 +; VBITS_GE_128-NEXT: .cfi_offset b15, -64 +; VBITS_GE_128-NEXT: mov x8, #120 +; VBITS_GE_128-NEXT: mov x9, #112 +; VBITS_GE_128-NEXT: mov x10, #104 +; VBITS_GE_128-NEXT: mov x11, #96 +; VBITS_GE_128-NEXT: mov x12, #88 +; VBITS_GE_128-NEXT: mov x13, #80 +; VBITS_GE_128-NEXT: mov x14, #72 +; VBITS_GE_128-NEXT: mov x15, #64 +; VBITS_GE_128-NEXT: mov x16, #56 +; VBITS_GE_128-NEXT: mov x17, #48 +; VBITS_GE_128-NEXT: mov x18, #40 +; VBITS_GE_128-NEXT: mov x2, #32 +; VBITS_GE_128-NEXT: mov x3, #24 +; VBITS_GE_128-NEXT: mov x4, #16 +; VBITS_GE_128-NEXT: mov x5, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z7.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z16.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x0, x15, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0, x16, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z17.h }, p0/z, [x0, x17, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z18.h }, p0/z, [x0, x18, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z19.h }, p0/z, [x0, x2, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z20.h }, p0/z, [x0, x3, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z21.h }, p0/z, [x0, x4, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z22.h }, p0/z, [x0, x5, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z23.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z24.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z25.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z26.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z27.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z28.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z29.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z30.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z31.h }, p0/z, [x1, x15, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z8.h }, p0/z, [x1, x16, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z9.h }, p0/z, [x1, x17, lsl #1] +; VBITS_GE_128-NEXT: and z1.d, z1.d, z24.d +; VBITS_GE_128-NEXT: and z2.d, z2.d, z25.d +; VBITS_GE_128-NEXT: ld1h { z10.h }, p0/z, [x1, x18, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z11.h }, p0/z, [x1, x2, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z12.h }, p0/z, [x1, x3, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z13.h }, p0/z, [x1, x4, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z14.h }, p0/z, [x1, x5, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z15.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: stp q2, q1, [x0, #224] +; VBITS_GE_128-NEXT: and z0.d, z0.d, z26.d +; VBITS_GE_128-NEXT: and z1.d, z7.d, z27.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #192] +; VBITS_GE_128-NEXT: and z0.d, z16.d, z28.d +; VBITS_GE_128-NEXT: and z1.d, z6.d, z29.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #160] +; VBITS_GE_128-NEXT: and z0.d, z5.d, 
z30.d +; VBITS_GE_128-NEXT: and z1.d, z4.d, z31.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #128] +; VBITS_GE_128-NEXT: and z0.d, z3.d, z8.d +; VBITS_GE_128-NEXT: and z1.d, z17.d, z9.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #96] +; VBITS_GE_128-NEXT: and z0.d, z18.d, z10.d +; VBITS_GE_128-NEXT: and z1.d, z19.d, z11.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #64] +; VBITS_GE_128-NEXT: and z0.d, z20.d, z12.d +; VBITS_GE_128-NEXT: and z1.d, z21.d, z13.d +; VBITS_GE_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: and z0.d, z22.d, z14.d +; VBITS_GE_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: and z1.d, z23.d, z15.d +; VBITS_GE_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v128i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #96 +; VBITS_GE_256-NEXT: mov x9, #112 +; VBITS_GE_256-NEXT: mov x10, #64 +; VBITS_GE_256-NEXT: mov x11, #80 +; VBITS_GE_256-NEXT: mov x12, #32 +; VBITS_GE_256-NEXT: mov x13, #48 +; VBITS_GE_256-NEXT: mov x14, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z22.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z23.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z6.d, z6.d, z17.d +; VBITS_GE_256-NEXT: and z5.d, z5.d, z16.d +; VBITS_GE_256-NEXT: and z4.d, z4.d, z19.d +; VBITS_GE_256-NEXT: and z3.d, z3.d, z18.d +; VBITS_GE_256-NEXT: and z2.d, z2.d, z21.d +; VBITS_GE_256-NEXT: and z1.d, z1.d, z20.d +; VBITS_GE_256-NEXT: and z0.d, z0.d, z22.d +; VBITS_GE_256-NEXT: and z7.d, z7.d, z23.d +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <128 x i16>, <128 x i16>* %a + %op2 = load <128 x i16>, <128 x i16>* %b + %res = and <128 x i16> %op1, %op2 + store <128 x i16> %res, <128 x i16>* %a + ret void +} + +define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: and_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 
+; CHECK-NEXT: ret + %res = and <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: and_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @and_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: and_v8i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: and z1.d, z1.d, z3.d +; VBITS_GE_128-NEXT: and z0.d, z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: and_v8i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = and <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @and_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: and_v16i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: mov x9, #12 +; VBITS_GE_128-NEXT: mov x10, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: and z0.d, z0.d, z4.d +; VBITS_GE_128-NEXT: and z1.d, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: and z0.d, z3.d, z7.d +; VBITS_GE_128-NEXT: and z1.d, z2.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z0.d, z0.d, z2.d +; VBITS_GE_256-NEXT: and z1.d, z1.d, z3.d +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: and_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; 
VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = and <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define void @and_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: and_v32i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #24 +; VBITS_GE_128-NEXT: mov x9, #28 +; VBITS_GE_128-NEXT: mov x10, #16 +; VBITS_GE_128-NEXT: mov x11, #20 +; VBITS_GE_128-NEXT: mov x12, #8 +; VBITS_GE_128-NEXT: mov x13, #12 +; VBITS_GE_128-NEXT: mov x14, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z16.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z17.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z18.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z19.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z20.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z21.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z22.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: and z0.d, z0.d, z16.d +; VBITS_GE_128-NEXT: and z1.d, z1.d, z17.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_GE_128-NEXT: and z0.d, z2.d, z18.d +; VBITS_GE_128-NEXT: and z1.d, z3.d, z19.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #64] +; VBITS_GE_128-NEXT: and z0.d, z4.d, z20.d +; VBITS_GE_128-NEXT: and z1.d, z5.d, z21.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: and z0.d, z7.d, z23.d +; VBITS_GE_128-NEXT: and z1.d, z6.d, z22.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z2.d, z2.d, z5.d +; VBITS_GE_256-NEXT: and z1.d, z1.d, z4.d +; VBITS_GE_256-NEXT: and z0.d, z0.d, z6.d +; VBITS_GE_256-NEXT: and z3.d, z3.d, z7.d +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <32 x i32>, <32 x i32>* %a + %op2 = load <32 x i32>, <32 x i32>* %b + %res = and <32 x i32> %op1, %op2 + store <32 x i32> %res, <32 x i32>* %a + ret void +} + +define void @and_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 
{ +; VBITS_GE_128-LABEL: and_v64i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 64 +; VBITS_GE_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_offset b8, -8 +; VBITS_GE_128-NEXT: .cfi_offset b9, -16 +; VBITS_GE_128-NEXT: .cfi_offset b10, -24 +; VBITS_GE_128-NEXT: .cfi_offset b11, -32 +; VBITS_GE_128-NEXT: .cfi_offset b12, -40 +; VBITS_GE_128-NEXT: .cfi_offset b13, -48 +; VBITS_GE_128-NEXT: .cfi_offset b14, -56 +; VBITS_GE_128-NEXT: .cfi_offset b15, -64 +; VBITS_GE_128-NEXT: mov x8, #60 +; VBITS_GE_128-NEXT: mov x9, #56 +; VBITS_GE_128-NEXT: mov x10, #52 +; VBITS_GE_128-NEXT: mov x11, #48 +; VBITS_GE_128-NEXT: mov x12, #44 +; VBITS_GE_128-NEXT: mov x13, #40 +; VBITS_GE_128-NEXT: mov x14, #36 +; VBITS_GE_128-NEXT: mov x15, #32 +; VBITS_GE_128-NEXT: mov x16, #28 +; VBITS_GE_128-NEXT: mov x17, #24 +; VBITS_GE_128-NEXT: mov x18, #20 +; VBITS_GE_128-NEXT: mov x2, #16 +; VBITS_GE_128-NEXT: mov x3, #12 +; VBITS_GE_128-NEXT: mov x4, #8 +; VBITS_GE_128-NEXT: mov x5, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z16.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x0, x15, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0, x16, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z17.s }, p0/z, [x0, x17, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z18.s }, p0/z, [x0, x18, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z19.s }, p0/z, [x0, x2, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z20.s }, p0/z, [x0, x3, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z21.s }, p0/z, [x0, x4, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z22.s }, p0/z, [x0, x5, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z23.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z24.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z25.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z26.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z27.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z28.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z29.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z30.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z31.s }, p0/z, [x1, x15, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z8.s }, p0/z, [x1, x16, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z9.s }, p0/z, [x1, x17, lsl #2] +; VBITS_GE_128-NEXT: and z1.d, z1.d, z24.d +; VBITS_GE_128-NEXT: and z2.d, z2.d, z25.d +; VBITS_GE_128-NEXT: ld1w { z10.s }, p0/z, [x1, x18, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z11.s }, p0/z, [x1, x2, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z12.s }, p0/z, [x1, x3, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z13.s }, p0/z, [x1, x4, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z14.s }, p0/z, [x1, x5, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z15.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: stp q2, q1, [x0, #224] +; VBITS_GE_128-NEXT: and z0.d, z0.d, z26.d +; VBITS_GE_128-NEXT: and z1.d, z7.d, z27.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #192] +; VBITS_GE_128-NEXT: and z0.d, z16.d, z28.d +; VBITS_GE_128-NEXT: 
and z1.d, z6.d, z29.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #160] +; VBITS_GE_128-NEXT: and z0.d, z5.d, z30.d +; VBITS_GE_128-NEXT: and z1.d, z4.d, z31.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #128] +; VBITS_GE_128-NEXT: and z0.d, z3.d, z8.d +; VBITS_GE_128-NEXT: and z1.d, z17.d, z9.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #96] +; VBITS_GE_128-NEXT: and z0.d, z18.d, z10.d +; VBITS_GE_128-NEXT: and z1.d, z19.d, z11.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #64] +; VBITS_GE_128-NEXT: and z0.d, z20.d, z12.d +; VBITS_GE_128-NEXT: and z1.d, z21.d, z13.d +; VBITS_GE_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: and z0.d, z22.d, z14.d +; VBITS_GE_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: and z1.d, z23.d, z15.d +; VBITS_GE_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #56 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x13, #24 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z6.d, z6.d, z17.d +; VBITS_GE_256-NEXT: and z5.d, z5.d, z16.d +; VBITS_GE_256-NEXT: and z4.d, z4.d, z19.d +; VBITS_GE_256-NEXT: and z3.d, z3.d, z18.d +; VBITS_GE_256-NEXT: and z2.d, z2.d, z21.d +; VBITS_GE_256-NEXT: and z1.d, z1.d, z20.d +; VBITS_GE_256-NEXT: and z0.d, z0.d, z22.d +; VBITS_GE_256-NEXT: and z7.d, z7.d, z23.d +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <64 x i32>, <64 x i32>* %a + %op2 = load <64 x i32>, <64 x i32>* %b + %res = and <64 x i32> %op1, %op2 + store <64 x i32> %res, <64 x i32>* %a + ret void +} + +define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: and_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def 
$z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: and_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @and_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: and_v4i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: and z1.d, z1.d, z3.d +; VBITS_GE_128-NEXT: and z0.d, z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v4i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: and_v4i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl4 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = and <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @and_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: and_v8i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: mov x9, #6 +; VBITS_GE_128-NEXT: mov x10, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: and z0.d, z0.d, z4.d +; VBITS_GE_128-NEXT: and z1.d, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: and z0.d, z3.d, z7.d +; VBITS_GE_128-NEXT: and z1.d, z2.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z0.d, z0.d, z2.d +; VBITS_GE_256-NEXT: and z1.d, z1.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: and_v8i64: +; VBITS_GE_512: // %bb.0: +; 
VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = and <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +define void @and_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: and_v16i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #12 +; VBITS_GE_128-NEXT: mov x9, #14 +; VBITS_GE_128-NEXT: mov x10, #8 +; VBITS_GE_128-NEXT: mov x11, #10 +; VBITS_GE_128-NEXT: mov x12, #4 +; VBITS_GE_128-NEXT: mov x13, #6 +; VBITS_GE_128-NEXT: mov x14, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z16.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z17.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z18.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z19.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z20.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z21.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z22.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: and z0.d, z0.d, z16.d +; VBITS_GE_128-NEXT: and z1.d, z1.d, z17.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_GE_128-NEXT: and z0.d, z2.d, z18.d +; VBITS_GE_128-NEXT: and z1.d, z3.d, z19.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #64] +; VBITS_GE_128-NEXT: and z0.d, z4.d, z20.d +; VBITS_GE_128-NEXT: and z1.d, z5.d, z21.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: and z0.d, z7.d, z23.d +; VBITS_GE_128-NEXT: and z1.d, z6.d, z22.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z2.d, z2.d, z5.d +; VBITS_GE_256-NEXT: and z1.d, z1.d, z4.d +; VBITS_GE_256-NEXT: and z0.d, z0.d, z6.d +; VBITS_GE_256-NEXT: and z3.d, z3.d, z7.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <16 x i64>, <16 x i64>* %a + %op2 = load <16 x i64>, <16 x i64>* %b + %res = and <16 x i64> %op1, %op2 + store <16 x i64> %res, <16 x i64>* %a 
+ ret void +} + +define void @and_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: and_v32i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 64 +; VBITS_GE_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_offset b8, -8 +; VBITS_GE_128-NEXT: .cfi_offset b9, -16 +; VBITS_GE_128-NEXT: .cfi_offset b10, -24 +; VBITS_GE_128-NEXT: .cfi_offset b11, -32 +; VBITS_GE_128-NEXT: .cfi_offset b12, -40 +; VBITS_GE_128-NEXT: .cfi_offset b13, -48 +; VBITS_GE_128-NEXT: .cfi_offset b14, -56 +; VBITS_GE_128-NEXT: .cfi_offset b15, -64 +; VBITS_GE_128-NEXT: mov x8, #30 +; VBITS_GE_128-NEXT: mov x9, #28 +; VBITS_GE_128-NEXT: mov x10, #26 +; VBITS_GE_128-NEXT: mov x11, #24 +; VBITS_GE_128-NEXT: mov x12, #22 +; VBITS_GE_128-NEXT: mov x13, #20 +; VBITS_GE_128-NEXT: mov x14, #18 +; VBITS_GE_128-NEXT: mov x15, #16 +; VBITS_GE_128-NEXT: mov x16, #14 +; VBITS_GE_128-NEXT: mov x17, #12 +; VBITS_GE_128-NEXT: mov x18, #10 +; VBITS_GE_128-NEXT: mov x2, #8 +; VBITS_GE_128-NEXT: mov x3, #6 +; VBITS_GE_128-NEXT: mov x4, #4 +; VBITS_GE_128-NEXT: mov x5, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z16.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x0, x15, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0, x16, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z17.d }, p0/z, [x0, x17, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z18.d }, p0/z, [x0, x18, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z19.d }, p0/z, [x0, x2, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z20.d }, p0/z, [x0, x3, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z21.d }, p0/z, [x0, x4, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z22.d }, p0/z, [x0, x5, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z23.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z24.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z25.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z26.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z27.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z28.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z29.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z30.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z31.d }, p0/z, [x1, x15, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z8.d }, p0/z, [x1, x16, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z9.d }, p0/z, [x1, x17, lsl #3] +; VBITS_GE_128-NEXT: and z1.d, z1.d, z24.d +; VBITS_GE_128-NEXT: and z2.d, z2.d, z25.d +; VBITS_GE_128-NEXT: ld1d { z10.d }, p0/z, [x1, x18, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z11.d }, p0/z, [x1, x2, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z12.d }, p0/z, [x1, x3, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z13.d }, p0/z, [x1, x4, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z14.d }, p0/z, [x1, x5, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z15.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: stp q2, q1, [x0, #224] +; VBITS_GE_128-NEXT: and z0.d, z0.d, z26.d +; VBITS_GE_128-NEXT: and z1.d, z7.d, z27.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, 
#192] +; VBITS_GE_128-NEXT: and z0.d, z16.d, z28.d +; VBITS_GE_128-NEXT: and z1.d, z6.d, z29.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #160] +; VBITS_GE_128-NEXT: and z0.d, z5.d, z30.d +; VBITS_GE_128-NEXT: and z1.d, z4.d, z31.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #128] +; VBITS_GE_128-NEXT: and z0.d, z3.d, z8.d +; VBITS_GE_128-NEXT: and z1.d, z17.d, z9.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #96] +; VBITS_GE_128-NEXT: and z0.d, z18.d, z10.d +; VBITS_GE_128-NEXT: and z1.d, z19.d, z11.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #64] +; VBITS_GE_128-NEXT: and z0.d, z20.d, z12.d +; VBITS_GE_128-NEXT: and z1.d, z21.d, z13.d +; VBITS_GE_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: and z0.d, z22.d, z14.d +; VBITS_GE_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: and z1.d, z23.d, z15.d +; VBITS_GE_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: and_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: mov x12, #8 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov x14, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: and z6.d, z6.d, z17.d +; VBITS_GE_256-NEXT: and z5.d, z5.d, z16.d +; VBITS_GE_256-NEXT: and z4.d, z4.d, z19.d +; VBITS_GE_256-NEXT: and z3.d, z3.d, z18.d +; VBITS_GE_256-NEXT: and z2.d, z2.d, z21.d +; VBITS_GE_256-NEXT: and z1.d, z1.d, z20.d +; VBITS_GE_256-NEXT: and z0.d, z0.d, z22.d +; VBITS_GE_256-NEXT: and z7.d, z7.d, z23.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <32 x i64>, <32 x i64>* %a + %op2 = load <32 x i64>, <32 x i64>* %b + %res = and <32 x i64> %op1, %op2 + store <32 x i64> %res, <32 x i64>* %a + ret void +} + +; +; OR +; + +; NOTE: Bitwise ops are element-size agnostic, so the checks below use the +; unpredicated orr z.d form of the instruction for every lane type. + +define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: or_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // 
kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: or_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @or_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: or_v32i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z3.d +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: or_v32i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl32 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = or <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @or_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: or_v64i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #32 +; VBITS_GE_128-NEXT: mov w9, #48 +; VBITS_GE_128-NEXT: mov w10, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: ld1b { z7.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z4.d +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: orr z0.d, z3.d, z7.d +; VBITS_GE_128-NEXT: orr z1.d, z2.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z2.d +; VBITS_GE_256-NEXT: orr z1.d, z1.d, z3.d +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: or_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue 
p0.b, vl64 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = or <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define void @or_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: or_v128i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #96 +; VBITS_GE_128-NEXT: mov w9, #112 +; VBITS_GE_128-NEXT: mov w10, #64 +; VBITS_GE_128-NEXT: mov w11, #80 +; VBITS_GE_128-NEXT: mov w12, #32 +; VBITS_GE_128-NEXT: mov w13, #48 +; VBITS_GE_128-NEXT: mov w14, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0, x11] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x0, x12] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x0, x13] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x0, x14] +; VBITS_GE_128-NEXT: ld1b { z7.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z16.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z17.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: ld1b { z18.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: ld1b { z19.b }, p0/z, [x1, x11] +; VBITS_GE_128-NEXT: ld1b { z20.b }, p0/z, [x1, x12] +; VBITS_GE_128-NEXT: ld1b { z21.b }, p0/z, [x1, x13] +; VBITS_GE_128-NEXT: ld1b { z22.b }, p0/z, [x1, x14] +; VBITS_GE_128-NEXT: ld1b { z23.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z16.d +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z17.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_GE_128-NEXT: orr z0.d, z2.d, z18.d +; VBITS_GE_128-NEXT: orr z1.d, z3.d, z19.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #64] +; VBITS_GE_128-NEXT: orr z0.d, z4.d, z20.d +; VBITS_GE_128-NEXT: orr z1.d, z5.d, z21.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: orr z0.d, z7.d, z23.d +; VBITS_GE_128-NEXT: orr z1.d, z6.d, z22.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v128i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #64 +; VBITS_GE_256-NEXT: mov w9, #96 +; VBITS_GE_256-NEXT: mov w10, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z2.d, z2.d, z5.d +; VBITS_GE_256-NEXT: orr z1.d, z1.d, z4.d +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z6.d +; VBITS_GE_256-NEXT: orr z3.d, z3.d, z7.d +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <128 x i8>, <128 x i8>* %a + %op2 = load <128 x i8>, <128 x i8>* %b + %res = or <128 x i8> %op1, %op2 + store <128 x i8> %res, <128 x i8>* %a + ret void +} + +define void @or_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: or_v256i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: stp d15, d14, [sp, #-64]! 
// 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 64 +; VBITS_GE_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_offset b8, -8 +; VBITS_GE_128-NEXT: .cfi_offset b9, -16 +; VBITS_GE_128-NEXT: .cfi_offset b10, -24 +; VBITS_GE_128-NEXT: .cfi_offset b11, -32 +; VBITS_GE_128-NEXT: .cfi_offset b12, -40 +; VBITS_GE_128-NEXT: .cfi_offset b13, -48 +; VBITS_GE_128-NEXT: .cfi_offset b14, -56 +; VBITS_GE_128-NEXT: .cfi_offset b15, -64 +; VBITS_GE_128-NEXT: mov w8, #240 +; VBITS_GE_128-NEXT: mov w9, #224 +; VBITS_GE_128-NEXT: mov w10, #208 +; VBITS_GE_128-NEXT: mov w11, #192 +; VBITS_GE_128-NEXT: mov w12, #176 +; VBITS_GE_128-NEXT: mov w13, #160 +; VBITS_GE_128-NEXT: mov w14, #144 +; VBITS_GE_128-NEXT: mov w15, #128 +; VBITS_GE_128-NEXT: mov w16, #112 +; VBITS_GE_128-NEXT: mov w17, #96 +; VBITS_GE_128-NEXT: mov w18, #80 +; VBITS_GE_128-NEXT: mov w2, #64 +; VBITS_GE_128-NEXT: mov w3, #48 +; VBITS_GE_128-NEXT: mov w4, #32 +; VBITS_GE_128-NEXT: mov w5, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z16.b }, p0/z, [x0, x11] +; VBITS_GE_128-NEXT: ld1b { z7.b }, p0/z, [x0, x12] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x0, x13] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x0, x14] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0, x15] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x0, x16] +; VBITS_GE_128-NEXT: ld1b { z17.b }, p0/z, [x0, x17] +; VBITS_GE_128-NEXT: ld1b { z18.b }, p0/z, [x0, x18] +; VBITS_GE_128-NEXT: ld1b { z19.b }, p0/z, [x0, x2] +; VBITS_GE_128-NEXT: ld1b { z20.b }, p0/z, [x0, x3] +; VBITS_GE_128-NEXT: ld1b { z21.b }, p0/z, [x0, x4] +; VBITS_GE_128-NEXT: ld1b { z22.b }, p0/z, [x0, x5] +; VBITS_GE_128-NEXT: ld1b { z23.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z24.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z25.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: ld1b { z26.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: ld1b { z27.b }, p0/z, [x1, x11] +; VBITS_GE_128-NEXT: ld1b { z28.b }, p0/z, [x1, x12] +; VBITS_GE_128-NEXT: ld1b { z29.b }, p0/z, [x1, x13] +; VBITS_GE_128-NEXT: ld1b { z30.b }, p0/z, [x1, x14] +; VBITS_GE_128-NEXT: ld1b { z31.b }, p0/z, [x1, x15] +; VBITS_GE_128-NEXT: ld1b { z8.b }, p0/z, [x1, x16] +; VBITS_GE_128-NEXT: ld1b { z9.b }, p0/z, [x1, x17] +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z24.d +; VBITS_GE_128-NEXT: orr z2.d, z2.d, z25.d +; VBITS_GE_128-NEXT: ld1b { z10.b }, p0/z, [x1, x18] +; VBITS_GE_128-NEXT: ld1b { z11.b }, p0/z, [x1, x2] +; VBITS_GE_128-NEXT: ld1b { z12.b }, p0/z, [x1, x3] +; VBITS_GE_128-NEXT: ld1b { z13.b }, p0/z, [x1, x4] +; VBITS_GE_128-NEXT: ld1b { z14.b }, p0/z, [x1, x5] +; VBITS_GE_128-NEXT: ld1b { z15.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: stp q2, q1, [x0, #224] +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z26.d +; VBITS_GE_128-NEXT: orr z1.d, z16.d, z27.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #192] +; VBITS_GE_128-NEXT: orr z0.d, z7.d, z28.d +; VBITS_GE_128-NEXT: orr z1.d, z6.d, z29.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #160] +; VBITS_GE_128-NEXT: orr z0.d, z5.d, z30.d +; VBITS_GE_128-NEXT: orr z1.d, z3.d, z31.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #128] +; VBITS_GE_128-NEXT: orr z0.d, z4.d, z8.d +; VBITS_GE_128-NEXT: orr z1.d, z17.d, z9.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #96] +; 
VBITS_GE_128-NEXT: orr z0.d, z18.d, z10.d +; VBITS_GE_128-NEXT: orr z1.d, z19.d, z11.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #64] +; VBITS_GE_128-NEXT: orr z0.d, z20.d, z12.d +; VBITS_GE_128-NEXT: orr z1.d, z21.d, z13.d +; VBITS_GE_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: orr z0.d, z23.d, z15.d +; VBITS_GE_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: orr z1.d, z22.d, z14.d +; VBITS_GE_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v256i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #192 +; VBITS_GE_256-NEXT: mov w9, #224 +; VBITS_GE_256-NEXT: mov w10, #128 +; VBITS_GE_256-NEXT: mov w11, #160 +; VBITS_GE_256-NEXT: mov w12, #64 +; VBITS_GE_256-NEXT: mov w13, #96 +; VBITS_GE_256-NEXT: mov w14, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x11] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x0, x12] +; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x0, x13] +; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x0, x14] +; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z16.b }, p0/z, [x1, x13] +; VBITS_GE_256-NEXT: ld1b { z17.b }, p0/z, [x1, x14] +; VBITS_GE_256-NEXT: ld1b { z18.b }, p0/z, [x1, x11] +; VBITS_GE_256-NEXT: ld1b { z19.b }, p0/z, [x1, x12] +; VBITS_GE_256-NEXT: ld1b { z20.b }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1b { z21.b }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1b { z22.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z23.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z6.d, z6.d, z17.d +; VBITS_GE_256-NEXT: orr z5.d, z5.d, z16.d +; VBITS_GE_256-NEXT: orr z4.d, z4.d, z19.d +; VBITS_GE_256-NEXT: orr z3.d, z3.d, z18.d +; VBITS_GE_256-NEXT: orr z2.d, z2.d, z21.d +; VBITS_GE_256-NEXT: orr z1.d, z1.d, z20.d +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z22.d +; VBITS_GE_256-NEXT: orr z7.d, z7.d, z23.d +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x11] +; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0, x12] +; VBITS_GE_256-NEXT: st1b { z5.b }, p0, [x0, x13] +; VBITS_GE_256-NEXT: st1b { z6.b }, p0, [x0, x14] +; VBITS_GE_256-NEXT: st1b { z7.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <256 x i8>, <256 x i8>* %a + %op2 = load <256 x i8>, <256 x i8>* %b + %res = or <256 x i8> %op1, %op2 + store <256 x i8> %res, <256 x i8>* %a + ret void +} + +define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: or_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: or_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res 
= or <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @or_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: or_v16i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z3.d +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: or_v16i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl16 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = or <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @or_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: or_v32i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #16 +; VBITS_GE_128-NEXT: mov x9, #24 +; VBITS_GE_128-NEXT: mov x10, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z7.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z4.d +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: orr z0.d, z3.d, z7.d +; VBITS_GE_128-NEXT: orr z1.d, z2.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z2.d +; VBITS_GE_256-NEXT: orr z1.d, z1.d, z3.d +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: or_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = or <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define void @or_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: 
or_v64i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #48 +; VBITS_GE_128-NEXT: mov x9, #56 +; VBITS_GE_128-NEXT: mov x10, #32 +; VBITS_GE_128-NEXT: mov x11, #40 +; VBITS_GE_128-NEXT: mov x12, #16 +; VBITS_GE_128-NEXT: mov x13, #24 +; VBITS_GE_128-NEXT: mov x14, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z16.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z17.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z18.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z19.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z20.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z21.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z22.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z23.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z16.d +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z17.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_GE_128-NEXT: orr z0.d, z2.d, z18.d +; VBITS_GE_128-NEXT: orr z1.d, z3.d, z19.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #64] +; VBITS_GE_128-NEXT: orr z0.d, z4.d, z20.d +; VBITS_GE_128-NEXT: orr z1.d, z5.d, z21.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: orr z0.d, z7.d, z23.d +; VBITS_GE_128-NEXT: orr z1.d, z6.d, z22.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z2.d, z2.d, z5.d +; VBITS_GE_256-NEXT: orr z1.d, z1.d, z4.d +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z6.d +; VBITS_GE_256-NEXT: orr z3.d, z3.d, z7.d +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <64 x i16>, <64 x i16>* %a + %op2 = load <64 x i16>, <64 x i16>* %b + %res = or <64 x i16> %op1, %op2 + store <64 x i16> %res, <64 x i16>* %a + ret void +} + +define void @or_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: or_v128i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: stp d15, d14, [sp, #-64]! 
// 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 64 +; VBITS_GE_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_offset b8, -8 +; VBITS_GE_128-NEXT: .cfi_offset b9, -16 +; VBITS_GE_128-NEXT: .cfi_offset b10, -24 +; VBITS_GE_128-NEXT: .cfi_offset b11, -32 +; VBITS_GE_128-NEXT: .cfi_offset b12, -40 +; VBITS_GE_128-NEXT: .cfi_offset b13, -48 +; VBITS_GE_128-NEXT: .cfi_offset b14, -56 +; VBITS_GE_128-NEXT: .cfi_offset b15, -64 +; VBITS_GE_128-NEXT: mov x8, #120 +; VBITS_GE_128-NEXT: mov x9, #112 +; VBITS_GE_128-NEXT: mov x10, #104 +; VBITS_GE_128-NEXT: mov x11, #96 +; VBITS_GE_128-NEXT: mov x12, #88 +; VBITS_GE_128-NEXT: mov x13, #80 +; VBITS_GE_128-NEXT: mov x14, #72 +; VBITS_GE_128-NEXT: mov x15, #64 +; VBITS_GE_128-NEXT: mov x16, #56 +; VBITS_GE_128-NEXT: mov x17, #48 +; VBITS_GE_128-NEXT: mov x18, #40 +; VBITS_GE_128-NEXT: mov x2, #32 +; VBITS_GE_128-NEXT: mov x3, #24 +; VBITS_GE_128-NEXT: mov x4, #16 +; VBITS_GE_128-NEXT: mov x5, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z7.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z16.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x0, x15, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0, x16, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z17.h }, p0/z, [x0, x17, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z18.h }, p0/z, [x0, x18, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z19.h }, p0/z, [x0, x2, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z20.h }, p0/z, [x0, x3, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z21.h }, p0/z, [x0, x4, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z22.h }, p0/z, [x0, x5, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z23.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z24.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z25.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z26.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z27.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z28.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z29.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z30.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z31.h }, p0/z, [x1, x15, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z8.h }, p0/z, [x1, x16, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z9.h }, p0/z, [x1, x17, lsl #1] +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z24.d +; VBITS_GE_128-NEXT: orr z2.d, z2.d, z25.d +; VBITS_GE_128-NEXT: ld1h { z10.h }, p0/z, [x1, x18, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z11.h }, p0/z, [x1, x2, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z12.h }, p0/z, [x1, x3, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z13.h }, p0/z, [x1, x4, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z14.h }, p0/z, [x1, x5, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z15.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: stp q2, q1, [x0, #224] +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z26.d +; VBITS_GE_128-NEXT: orr z1.d, z7.d, z27.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #192] +; VBITS_GE_128-NEXT: orr z0.d, z16.d, z28.d +; VBITS_GE_128-NEXT: orr z1.d, z6.d, z29.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #160] +; VBITS_GE_128-NEXT: orr z0.d, z5.d, 
z30.d +; VBITS_GE_128-NEXT: orr z1.d, z4.d, z31.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #128] +; VBITS_GE_128-NEXT: orr z0.d, z3.d, z8.d +; VBITS_GE_128-NEXT: orr z1.d, z17.d, z9.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #96] +; VBITS_GE_128-NEXT: orr z0.d, z18.d, z10.d +; VBITS_GE_128-NEXT: orr z1.d, z19.d, z11.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #64] +; VBITS_GE_128-NEXT: orr z0.d, z20.d, z12.d +; VBITS_GE_128-NEXT: orr z1.d, z21.d, z13.d +; VBITS_GE_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: orr z0.d, z22.d, z14.d +; VBITS_GE_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: orr z1.d, z23.d, z15.d +; VBITS_GE_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v128i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #96 +; VBITS_GE_256-NEXT: mov x9, #112 +; VBITS_GE_256-NEXT: mov x10, #64 +; VBITS_GE_256-NEXT: mov x11, #80 +; VBITS_GE_256-NEXT: mov x12, #32 +; VBITS_GE_256-NEXT: mov x13, #48 +; VBITS_GE_256-NEXT: mov x14, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z22.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z23.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z6.d, z6.d, z17.d +; VBITS_GE_256-NEXT: orr z5.d, z5.d, z16.d +; VBITS_GE_256-NEXT: orr z4.d, z4.d, z19.d +; VBITS_GE_256-NEXT: orr z3.d, z3.d, z18.d +; VBITS_GE_256-NEXT: orr z2.d, z2.d, z21.d +; VBITS_GE_256-NEXT: orr z1.d, z1.d, z20.d +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z22.d +; VBITS_GE_256-NEXT: orr z7.d, z7.d, z23.d +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <128 x i16>, <128 x i16>* %a + %op2 = load <128 x i16>, <128 x i16>* %b + %res = or <128 x i16> %op1, %op2 + store <128 x i16> %res, <128 x i16>* %a + ret void +} + +define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: or_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; 
CHECK-NEXT: ret + %res = or <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: or_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @or_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: or_v8i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z3.d +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: or_v8i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = or <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @or_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: or_v16i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: mov x9, #12 +; VBITS_GE_128-NEXT: mov x10, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z4.d +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: orr z0.d, z3.d, z7.d +; VBITS_GE_128-NEXT: orr z1.d, z2.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z2.d +; VBITS_GE_256-NEXT: orr z1.d, z1.d, z3.d +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: or_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s 
}, p0/z, [x1] +; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = or <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define void @or_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: or_v32i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #24 +; VBITS_GE_128-NEXT: mov x9, #28 +; VBITS_GE_128-NEXT: mov x10, #16 +; VBITS_GE_128-NEXT: mov x11, #20 +; VBITS_GE_128-NEXT: mov x12, #8 +; VBITS_GE_128-NEXT: mov x13, #12 +; VBITS_GE_128-NEXT: mov x14, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z16.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z17.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z18.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z19.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z20.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z21.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z22.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z16.d +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z17.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_GE_128-NEXT: orr z0.d, z2.d, z18.d +; VBITS_GE_128-NEXT: orr z1.d, z3.d, z19.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #64] +; VBITS_GE_128-NEXT: orr z0.d, z4.d, z20.d +; VBITS_GE_128-NEXT: orr z1.d, z5.d, z21.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: orr z0.d, z7.d, z23.d +; VBITS_GE_128-NEXT: orr z1.d, z6.d, z22.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z2.d, z2.d, z5.d +; VBITS_GE_256-NEXT: orr z1.d, z1.d, z4.d +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z6.d +; VBITS_GE_256-NEXT: orr z3.d, z3.d, z7.d +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <32 x i32>, <32 x i32>* %a + %op2 = load <32 x i32>, <32 x i32>* %b + %res = or <32 x i32> %op1, %op2 + store <32 x i32> %res, <32 x i32>* %a + ret void +} + +define void @or_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: or_v64i32: 
+; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 64 +; VBITS_GE_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_offset b8, -8 +; VBITS_GE_128-NEXT: .cfi_offset b9, -16 +; VBITS_GE_128-NEXT: .cfi_offset b10, -24 +; VBITS_GE_128-NEXT: .cfi_offset b11, -32 +; VBITS_GE_128-NEXT: .cfi_offset b12, -40 +; VBITS_GE_128-NEXT: .cfi_offset b13, -48 +; VBITS_GE_128-NEXT: .cfi_offset b14, -56 +; VBITS_GE_128-NEXT: .cfi_offset b15, -64 +; VBITS_GE_128-NEXT: mov x8, #60 +; VBITS_GE_128-NEXT: mov x9, #56 +; VBITS_GE_128-NEXT: mov x10, #52 +; VBITS_GE_128-NEXT: mov x11, #48 +; VBITS_GE_128-NEXT: mov x12, #44 +; VBITS_GE_128-NEXT: mov x13, #40 +; VBITS_GE_128-NEXT: mov x14, #36 +; VBITS_GE_128-NEXT: mov x15, #32 +; VBITS_GE_128-NEXT: mov x16, #28 +; VBITS_GE_128-NEXT: mov x17, #24 +; VBITS_GE_128-NEXT: mov x18, #20 +; VBITS_GE_128-NEXT: mov x2, #16 +; VBITS_GE_128-NEXT: mov x3, #12 +; VBITS_GE_128-NEXT: mov x4, #8 +; VBITS_GE_128-NEXT: mov x5, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z16.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x0, x15, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0, x16, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z17.s }, p0/z, [x0, x17, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z18.s }, p0/z, [x0, x18, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z19.s }, p0/z, [x0, x2, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z20.s }, p0/z, [x0, x3, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z21.s }, p0/z, [x0, x4, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z22.s }, p0/z, [x0, x5, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z23.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z24.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z25.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z26.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z27.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z28.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z29.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z30.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z31.s }, p0/z, [x1, x15, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z8.s }, p0/z, [x1, x16, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z9.s }, p0/z, [x1, x17, lsl #2] +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z24.d +; VBITS_GE_128-NEXT: orr z2.d, z2.d, z25.d +; VBITS_GE_128-NEXT: ld1w { z10.s }, p0/z, [x1, x18, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z11.s }, p0/z, [x1, x2, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z12.s }, p0/z, [x1, x3, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z13.s }, p0/z, [x1, x4, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z14.s }, p0/z, [x1, x5, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z15.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: stp q2, q1, [x0, #224] +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z26.d +; VBITS_GE_128-NEXT: orr z1.d, z7.d, z27.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #192] +; VBITS_GE_128-NEXT: orr z0.d, z16.d, z28.d +; VBITS_GE_128-NEXT: orr z1.d, z6.d, z29.d +; 
VBITS_GE_128-NEXT: stp q1, q0, [x0, #160] +; VBITS_GE_128-NEXT: orr z0.d, z5.d, z30.d +; VBITS_GE_128-NEXT: orr z1.d, z4.d, z31.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #128] +; VBITS_GE_128-NEXT: orr z0.d, z3.d, z8.d +; VBITS_GE_128-NEXT: orr z1.d, z17.d, z9.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #96] +; VBITS_GE_128-NEXT: orr z0.d, z18.d, z10.d +; VBITS_GE_128-NEXT: orr z1.d, z19.d, z11.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #64] +; VBITS_GE_128-NEXT: orr z0.d, z20.d, z12.d +; VBITS_GE_128-NEXT: orr z1.d, z21.d, z13.d +; VBITS_GE_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: orr z0.d, z22.d, z14.d +; VBITS_GE_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: orr z1.d, z23.d, z15.d +; VBITS_GE_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #56 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x13, #24 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z6.d, z6.d, z17.d +; VBITS_GE_256-NEXT: orr z5.d, z5.d, z16.d +; VBITS_GE_256-NEXT: orr z4.d, z4.d, z19.d +; VBITS_GE_256-NEXT: orr z3.d, z3.d, z18.d +; VBITS_GE_256-NEXT: orr z2.d, z2.d, z21.d +; VBITS_GE_256-NEXT: orr z1.d, z1.d, z20.d +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z22.d +; VBITS_GE_256-NEXT: orr z7.d, z7.d, z23.d +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <64 x i32>, <64 x i32>* %a + %op2 = load <64 x i32>, <64 x i32>* %b + %res = or <64 x i32> %op1, %op2 + store <64 x i32> %res, <64 x i32>* %a + ret void +} + +define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: or_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, 
z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: or_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @or_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: or_v4i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z3.d +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v4i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: or_v4i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl4 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = or <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @or_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: or_v8i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: mov x9, #6 +; VBITS_GE_128-NEXT: mov x10, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z4.d +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: orr z0.d, z3.d, z7.d +; VBITS_GE_128-NEXT: orr z1.d, z2.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z2.d +; VBITS_GE_256-NEXT: orr z1.d, z1.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: or_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: 
ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = or <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +define void @or_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: or_v16i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #12 +; VBITS_GE_128-NEXT: mov x9, #14 +; VBITS_GE_128-NEXT: mov x10, #8 +; VBITS_GE_128-NEXT: mov x11, #10 +; VBITS_GE_128-NEXT: mov x12, #4 +; VBITS_GE_128-NEXT: mov x13, #6 +; VBITS_GE_128-NEXT: mov x14, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z16.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z17.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z18.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z19.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z20.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z21.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z22.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z16.d +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z17.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_GE_128-NEXT: orr z0.d, z2.d, z18.d +; VBITS_GE_128-NEXT: orr z1.d, z3.d, z19.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #64] +; VBITS_GE_128-NEXT: orr z0.d, z4.d, z20.d +; VBITS_GE_128-NEXT: orr z1.d, z5.d, z21.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: orr z0.d, z7.d, z23.d +; VBITS_GE_128-NEXT: orr z1.d, z6.d, z22.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z2.d, z2.d, z5.d +; VBITS_GE_256-NEXT: orr z1.d, z1.d, z4.d +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z6.d +; VBITS_GE_256-NEXT: orr z3.d, z3.d, z7.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <16 x i64>, <16 x i64>* %a + %op2 = load <16 x i64>, <16 x i64>* %b + %res = or <16 x i64> %op1, %op2 + store <16 x i64> %res, <16 x i64>* %a + ret void +} + +define void @or_v32i64(<32 x i64>* %a, <32 x 
i64>* %b) #0 { +; VBITS_GE_128-LABEL: or_v32i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 64 +; VBITS_GE_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_offset b8, -8 +; VBITS_GE_128-NEXT: .cfi_offset b9, -16 +; VBITS_GE_128-NEXT: .cfi_offset b10, -24 +; VBITS_GE_128-NEXT: .cfi_offset b11, -32 +; VBITS_GE_128-NEXT: .cfi_offset b12, -40 +; VBITS_GE_128-NEXT: .cfi_offset b13, -48 +; VBITS_GE_128-NEXT: .cfi_offset b14, -56 +; VBITS_GE_128-NEXT: .cfi_offset b15, -64 +; VBITS_GE_128-NEXT: mov x8, #30 +; VBITS_GE_128-NEXT: mov x9, #28 +; VBITS_GE_128-NEXT: mov x10, #26 +; VBITS_GE_128-NEXT: mov x11, #24 +; VBITS_GE_128-NEXT: mov x12, #22 +; VBITS_GE_128-NEXT: mov x13, #20 +; VBITS_GE_128-NEXT: mov x14, #18 +; VBITS_GE_128-NEXT: mov x15, #16 +; VBITS_GE_128-NEXT: mov x16, #14 +; VBITS_GE_128-NEXT: mov x17, #12 +; VBITS_GE_128-NEXT: mov x18, #10 +; VBITS_GE_128-NEXT: mov x2, #8 +; VBITS_GE_128-NEXT: mov x3, #6 +; VBITS_GE_128-NEXT: mov x4, #4 +; VBITS_GE_128-NEXT: mov x5, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z16.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x0, x15, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0, x16, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z17.d }, p0/z, [x0, x17, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z18.d }, p0/z, [x0, x18, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z19.d }, p0/z, [x0, x2, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z20.d }, p0/z, [x0, x3, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z21.d }, p0/z, [x0, x4, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z22.d }, p0/z, [x0, x5, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z23.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z24.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z25.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z26.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z27.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z28.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z29.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z30.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z31.d }, p0/z, [x1, x15, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z8.d }, p0/z, [x1, x16, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z9.d }, p0/z, [x1, x17, lsl #3] +; VBITS_GE_128-NEXT: orr z1.d, z1.d, z24.d +; VBITS_GE_128-NEXT: orr z2.d, z2.d, z25.d +; VBITS_GE_128-NEXT: ld1d { z10.d }, p0/z, [x1, x18, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z11.d }, p0/z, [x1, x2, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z12.d }, p0/z, [x1, x3, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z13.d }, p0/z, [x1, x4, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z14.d }, p0/z, [x1, x5, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z15.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: stp q2, q1, [x0, #224] +; VBITS_GE_128-NEXT: orr z0.d, z0.d, z26.d +; VBITS_GE_128-NEXT: orr z1.d, z7.d, z27.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #192] +; VBITS_GE_128-NEXT: orr z0.d, z16.d, z28.d +; 
VBITS_GE_128-NEXT: orr z1.d, z6.d, z29.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #160] +; VBITS_GE_128-NEXT: orr z0.d, z5.d, z30.d +; VBITS_GE_128-NEXT: orr z1.d, z4.d, z31.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #128] +; VBITS_GE_128-NEXT: orr z0.d, z3.d, z8.d +; VBITS_GE_128-NEXT: orr z1.d, z17.d, z9.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #96] +; VBITS_GE_128-NEXT: orr z0.d, z18.d, z10.d +; VBITS_GE_128-NEXT: orr z1.d, z19.d, z11.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #64] +; VBITS_GE_128-NEXT: orr z0.d, z20.d, z12.d +; VBITS_GE_128-NEXT: orr z1.d, z21.d, z13.d +; VBITS_GE_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: orr z0.d, z22.d, z14.d +; VBITS_GE_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: orr z1.d, z23.d, z15.d +; VBITS_GE_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: or_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: mov x12, #8 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov x14, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: orr z6.d, z6.d, z17.d +; VBITS_GE_256-NEXT: orr z5.d, z5.d, z16.d +; VBITS_GE_256-NEXT: orr z4.d, z4.d, z19.d +; VBITS_GE_256-NEXT: orr z3.d, z3.d, z18.d +; VBITS_GE_256-NEXT: orr z2.d, z2.d, z21.d +; VBITS_GE_256-NEXT: orr z1.d, z1.d, z20.d +; VBITS_GE_256-NEXT: orr z0.d, z0.d, z22.d +; VBITS_GE_256-NEXT: orr z7.d, z7.d, z23.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <32 x i64>, <32 x i64>* %a + %op2 = load <32 x i64>, <32 x i64>* %b + %res = or <32 x i64> %op1, %op2 + store <32 x i64> %res, <32 x i64>* %a + ret void +} + +; +; XOR +; + +define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: xor_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // 
kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: xor_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @xor_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: xor_v32i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: eor z1.d, z1.d, z3.d +; VBITS_GE_128-NEXT: eor z0.d, z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: xor_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: eor z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: xor_v32i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl32 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = xor <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @xor_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: xor_v64i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #32 +; VBITS_GE_128-NEXT: mov w9, #48 +; VBITS_GE_128-NEXT: mov w10, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: ld1b { z7.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: eor z0.d, z0.d, z4.d +; VBITS_GE_128-NEXT: eor z1.d, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: eor z0.d, z3.d, z7.d +; VBITS_GE_128-NEXT: eor z1.d, z2.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: xor_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: eor z0.d, z0.d, z2.d +; VBITS_GE_256-NEXT: eor z1.d, z1.d, z3.d +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: xor_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: ld1b { 
z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = xor <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: xor_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: xor_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @xor_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: xor_v16i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: eor z1.d, z1.d, z3.d +; VBITS_GE_128-NEXT: eor z0.d, z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: xor_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: eor z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: xor_v16i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl16 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = xor <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @xor_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: xor_v32i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #16 +; VBITS_GE_128-NEXT: mov x9, #24 +; VBITS_GE_128-NEXT: mov x10, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z7.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: eor z0.d, z0.d, z4.d +; VBITS_GE_128-NEXT: eor z1.d, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: eor z0.d, z3.d, z7.d +; VBITS_GE_128-NEXT: eor z1.d, z2.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: xor_v32i16: +; VBITS_GE_256: // %bb.0: +; 
VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: eor z0.d, z0.d, z2.d +; VBITS_GE_256-NEXT: eor z1.d, z1.d, z3.d +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: xor_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = xor <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: xor_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: xor_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @xor_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: xor_v8i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: eor z1.d, z1.d, z3.d +; VBITS_GE_128-NEXT: eor z0.d, z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: xor_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: eor z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: xor_v8i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = xor <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @xor_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: xor_v16i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: mov x9, #12 +; VBITS_GE_128-NEXT: mov x10, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; 
VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: eor z0.d, z0.d, z4.d +; VBITS_GE_128-NEXT: eor z1.d, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: eor z0.d, z3.d, z7.d +; VBITS_GE_128-NEXT: eor z1.d, z2.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: xor_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: eor z0.d, z0.d, z2.d +; VBITS_GE_256-NEXT: eor z1.d, z1.d, z3.d +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: xor_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = xor <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: xor_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: xor_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @xor_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: xor_v4i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: eor z1.d, z1.d, z3.d +; VBITS_GE_128-NEXT: eor z0.d, z0.d, z2.d +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: xor_v4i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: eor z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: xor_v4i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl4 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; 
VBITS_GE_512-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = xor <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @xor_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: xor_v8i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: mov x9, #6 +; VBITS_GE_128-NEXT: mov x10, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: eor z0.d, z0.d, z4.d +; VBITS_GE_128-NEXT: eor z1.d, z1.d, z5.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: eor z0.d, z3.d, z7.d +; VBITS_GE_128-NEXT: eor z1.d, z2.d, z6.d +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: xor_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: eor z0.d, z0.d, z2.d +; VBITS_GE_256-NEXT: eor z1.d, z1.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: xor_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = xor <8 x i64> %op1, %op2 + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -0,0 +1,2272 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -aarch64-sve-vector-bits-min=128 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128 +; RUN: llc -aarch64-sve-vector-bits-min=256 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=512 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=2048 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 + +; This test only tests the legal types for a given vector width, as mulh nodes +; are not generated for non-legal types.
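+;
+; A mulh node is formed from IR of the following shape (a minimal sketch for
+; illustration only; the value names below are hypothetical and not part of
+; this patch): the operands are extended to twice the element width (sext for
+; smulh, zext for umulh), multiplied, shifted right by the original element
+; width, and truncated back down, e.g.
+;   %e1 = sext <8 x i8> %a to <8 x i16>
+;   %e2 = sext <8 x i8> %b to <8 x i16>
+;   %m = mul <8 x i16> %e1, %e2
+;   %s = lshr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+;   %r = trunc <8 x i16> %s to <8 x i8>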
+ +target triple = "aarch64-unknown-linux-gnu" + +; +; SMULH +; + +define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: smulh_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %insert = insertelement <8 x i16> undef, i16 8, i64 0 + %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer + %1 = sext <8 x i8> %op1 to <8 x i16> + %2 = sext <8 x i8> %op2 to <8 x i16> + %mul = mul <8 x i16> %1, %2 + %shr = lshr <8 x i16> %mul, %splat + %res = trunc <8 x i16> %shr to <8 x i8> + ret <8 x i8> %res +} + +define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: smulh_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <16 x i8> %op1 to <16 x i16> + %2 = sext <16 x i8> %op2 to <16 x i16> + %mul = mul <16 x i16> %1, %2 + %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %res = trunc <16 x i16> %shr to <16 x i8> + ret <16 x i8> %res +} + +define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: smulh_v32i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ptrue p1.h, vl8 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: adrp x8, .LCPI2_0 +; VBITS_GE_128-NEXT: add x8, x8, :lo12:.LCPI2_0 +; VBITS_GE_128-NEXT: sunpklo z5.h, z2.b +; VBITS_GE_128-NEXT: ext z2.b, z2.b, z2.b, #8 +; VBITS_GE_128-NEXT: sunpklo z7.h, z3.b +; VBITS_GE_128-NEXT: ld1h { z16.h }, p1/z, [x8] +; VBITS_GE_128-NEXT: ext z3.b, z3.b, z3.b, #8 +; VBITS_GE_128-NEXT: sunpklo z2.h, z2.b +; VBITS_GE_128-NEXT: sunpklo z3.h, z3.b +; VBITS_GE_128-NEXT: mul z5.h, p1/m, z5.h, z7.h +; VBITS_GE_128-NEXT: mul z2.h, p1/m, z2.h, z3.h +; VBITS_GE_128-NEXT: movprfx z3, z5 +; VBITS_GE_128-NEXT: lsr z3.h, p1/m, z3.h, z16.h +; VBITS_GE_128-NEXT: fmov w8, s3 +; VBITS_GE_128-NEXT: sunpklo z4.h, z0.b +; VBITS_GE_128-NEXT: sunpklo z6.h, z1.b +; VBITS_GE_128-NEXT: ext z0.b, z0.b, z0.b, #8 +; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 +; VBITS_GE_128-NEXT: lsr z2.h, p1/m, z2.h, z16.h +; VBITS_GE_128-NEXT: mov z5.h, z3.h[7] +; VBITS_GE_128-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_128-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_128-NEXT: mul z4.h, p1/m, z4.h, z6.h +; VBITS_GE_128-NEXT: mov z6.h, z3.h[6] +; VBITS_GE_128-NEXT: mov z7.h, z3.h[5] +; VBITS_GE_128-NEXT: mov z17.h, z3.h[4] +; VBITS_GE_128-NEXT: mov z18.h, z3.h[3] +; VBITS_GE_128-NEXT: mov z19.h, z3.h[2] +; VBITS_GE_128-NEXT: mov z20.h, z3.h[1] +; VBITS_GE_128-NEXT: mov z3.h, z2.h[7] +; VBITS_GE_128-NEXT: mov z21.h, z2.h[6] +; VBITS_GE_128-NEXT: mov z22.h, z2.h[5] +; VBITS_GE_128-NEXT: mov z23.h, z2.h[4] +; VBITS_GE_128-NEXT: mov z24.h, z2.h[3] +; VBITS_GE_128-NEXT: mov z25.h, z2.h[2] +; VBITS_GE_128-NEXT: mov z26.h, z2.h[1] +; VBITS_GE_128-NEXT: fmov w9, s2 +; VBITS_GE_128-NEXT: mul z0.h, p1/m, z0.h, z1.h +; VBITS_GE_128-NEXT: fmov w10, s5 +; VBITS_GE_128-NEXT: strb w8, [sp, #-32]!
+; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 32 +; VBITS_GE_128-NEXT: fmov w8, s6 +; VBITS_GE_128-NEXT: strb w9, [sp, #8] +; VBITS_GE_128-NEXT: fmov w9, s7 +; VBITS_GE_128-NEXT: strb w10, [sp, #7] +; VBITS_GE_128-NEXT: fmov w10, s17 +; VBITS_GE_128-NEXT: lsr z0.h, p1/m, z0.h, z16.h +; VBITS_GE_128-NEXT: strb w8, [sp, #6] +; VBITS_GE_128-NEXT: fmov w8, s18 +; VBITS_GE_128-NEXT: strb w9, [sp, #5] +; VBITS_GE_128-NEXT: fmov w9, s19 +; VBITS_GE_128-NEXT: strb w10, [sp, #4] +; VBITS_GE_128-NEXT: fmov w10, s20 +; VBITS_GE_128-NEXT: strb w8, [sp, #3] +; VBITS_GE_128-NEXT: fmov w8, s3 +; VBITS_GE_128-NEXT: strb w9, [sp, #2] +; VBITS_GE_128-NEXT: fmov w9, s21 +; VBITS_GE_128-NEXT: strb w10, [sp, #1] +; VBITS_GE_128-NEXT: fmov w10, s22 +; VBITS_GE_128-NEXT: strb w8, [sp, #15] +; VBITS_GE_128-NEXT: fmov w8, s23 +; VBITS_GE_128-NEXT: strb w9, [sp, #14] +; VBITS_GE_128-NEXT: fmov w9, s24 +; VBITS_GE_128-NEXT: strb w10, [sp, #13] +; VBITS_GE_128-NEXT: fmov w10, s25 +; VBITS_GE_128-NEXT: strb w8, [sp, #12] +; VBITS_GE_128-NEXT: fmov w8, s26 +; VBITS_GE_128-NEXT: movprfx z1, z4 +; VBITS_GE_128-NEXT: lsr z1.h, p1/m, z1.h, z16.h +; VBITS_GE_128-NEXT: strb w9, [sp, #11] +; VBITS_GE_128-NEXT: mov z2.h, z1.h[7] +; VBITS_GE_128-NEXT: fmov w9, s1 +; VBITS_GE_128-NEXT: strb w10, [sp, #10] +; VBITS_GE_128-NEXT: fmov w10, s0 +; VBITS_GE_128-NEXT: strb w8, [sp, #9] +; VBITS_GE_128-NEXT: fmov w8, s2 +; VBITS_GE_128-NEXT: mov z3.h, z1.h[6] +; VBITS_GE_128-NEXT: mov z4.h, z1.h[5] +; VBITS_GE_128-NEXT: mov z5.h, z1.h[4] +; VBITS_GE_128-NEXT: strb w9, [sp, #16] +; VBITS_GE_128-NEXT: fmov w9, s3 +; VBITS_GE_128-NEXT: strb w10, [sp, #24] +; VBITS_GE_128-NEXT: fmov w10, s4 +; VBITS_GE_128-NEXT: strb w8, [sp, #23] +; VBITS_GE_128-NEXT: fmov w8, s5 +; VBITS_GE_128-NEXT: mov z6.h, z1.h[3] +; VBITS_GE_128-NEXT: mov z7.h, z1.h[2] +; VBITS_GE_128-NEXT: mov z16.h, z1.h[1] +; VBITS_GE_128-NEXT: strb w9, [sp, #22] +; VBITS_GE_128-NEXT: fmov w9, s6 +; VBITS_GE_128-NEXT: strb w10, [sp, #21] +; VBITS_GE_128-NEXT: fmov w10, s7 +; VBITS_GE_128-NEXT: strb w8, [sp, #20] +; VBITS_GE_128-NEXT: fmov w8, s16 +; VBITS_GE_128-NEXT: mov z1.h, z0.h[7] +; VBITS_GE_128-NEXT: mov z17.h, z0.h[6] +; VBITS_GE_128-NEXT: mov z18.h, z0.h[5] +; VBITS_GE_128-NEXT: strb w9, [sp, #19] +; VBITS_GE_128-NEXT: fmov w9, s1 +; VBITS_GE_128-NEXT: strb w10, [sp, #18] +; VBITS_GE_128-NEXT: fmov w10, s17 +; VBITS_GE_128-NEXT: strb w8, [sp, #17] +; VBITS_GE_128-NEXT: fmov w8, s18 +; VBITS_GE_128-NEXT: mov z19.h, z0.h[4] +; VBITS_GE_128-NEXT: mov z20.h, z0.h[3] +; VBITS_GE_128-NEXT: mov z21.h, z0.h[2] +; VBITS_GE_128-NEXT: strb w9, [sp, #31] +; VBITS_GE_128-NEXT: fmov w9, s19 +; VBITS_GE_128-NEXT: strb w10, [sp, #30] +; VBITS_GE_128-NEXT: fmov w10, s20 +; VBITS_GE_128-NEXT: strb w8, [sp, #29] +; VBITS_GE_128-NEXT: fmov w8, s21 +; VBITS_GE_128-NEXT: mov z22.h, z0.h[1] +; VBITS_GE_128-NEXT: strb w9, [sp, #28] +; VBITS_GE_128-NEXT: fmov w9, s22 +; VBITS_GE_128-NEXT: strb w10, [sp, #27] +; VBITS_GE_128-NEXT: mov x10, sp +; VBITS_GE_128-NEXT: strb w8, [sp, #26] +; VBITS_GE_128-NEXT: add x8, sp, #16 +; VBITS_GE_128-NEXT: strb w9, [sp, #25] +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x10] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x8] +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: add sp, sp, #32 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: smulh_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: smulh z0.b, p0/m, z0.b, 
z1.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: smulh_v32i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl32 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %1 = sext <32 x i8> %op1 to <32 x i16> + %2 = sext <32 x i8> %op2 to <32 x i16> + %mul = mul <32 x i16> %1, %2 + %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %res = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: smulh_v64i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #64 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 64 +; VBITS_GE_128-NEXT: mov w8, #16 +; VBITS_GE_128-NEXT: mov w9, #48 +; VBITS_GE_128-NEXT: mov w10, #32 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ptrue p1.h, vl8 +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z7.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: adrp x8, .LCPI3_0 +; VBITS_GE_128-NEXT: add x8, x8, :lo12:.LCPI3_0 +; VBITS_GE_128-NEXT: sunpklo z18.h, z2.b +; VBITS_GE_128-NEXT: ext z2.b, z2.b, z2.b, #8 +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: sunpklo z19.h, z2.b +; VBITS_GE_128-NEXT: ld1h { z2.h }, p1/z, [x8] +; VBITS_GE_128-NEXT: sunpklo z21.h, z7.b +; VBITS_GE_128-NEXT: ext z7.b, z7.b, z7.b, #8 +; VBITS_GE_128-NEXT: sunpklo z23.h, z7.b +; VBITS_GE_128-NEXT: sunpklo z7.h, z4.b +; VBITS_GE_128-NEXT: mul z18.h, p1/m, z18.h, z7.h +; VBITS_GE_128-NEXT: sunpklo z16.h, z3.b +; VBITS_GE_128-NEXT: lsr z18.h, p1/m, z18.h, z2.h +; VBITS_GE_128-NEXT: ext z3.b, z3.b, z3.b, #8 +; VBITS_GE_128-NEXT: ext z4.b, z4.b, z4.b, #8 +; VBITS_GE_128-NEXT: fmov w8, s18 +; VBITS_GE_128-NEXT: sunpklo z3.h, z3.b +; VBITS_GE_128-NEXT: sunpklo z20.h, z5.b +; VBITS_GE_128-NEXT: sunpklo z4.h, z4.b +; VBITS_GE_128-NEXT: sunpklo z17.h, z0.b +; VBITS_GE_128-NEXT: sunpklo z22.h, z1.b +; VBITS_GE_128-NEXT: sunpklo z24.h, z6.b +; VBITS_GE_128-NEXT: mul z19.h, p1/m, z19.h, z4.h +; VBITS_GE_128-NEXT: movprfx z7, z16 +; VBITS_GE_128-NEXT: mul z7.h, p1/m, z7.h, z21.h +; VBITS_GE_128-NEXT: movprfx z16, z3 +; VBITS_GE_128-NEXT: mul z16.h, p1/m, z16.h, z23.h +; VBITS_GE_128-NEXT: movprfx z3, z17 +; VBITS_GE_128-NEXT: mul z3.h, p1/m, z3.h, z22.h +; VBITS_GE_128-NEXT: movprfx z4, z20 +; VBITS_GE_128-NEXT: mul z4.h, p1/m, z4.h, z24.h +; VBITS_GE_128-NEXT: mov z20.h, z18.h[6] +; VBITS_GE_128-NEXT: movprfx z17, z19 +; VBITS_GE_128-NEXT: lsr z17.h, p1/m, z17.h, z2.h +; VBITS_GE_128-NEXT: fmov w9, s17 +; VBITS_GE_128-NEXT: strb w8, [sp, #32] +; VBITS_GE_128-NEXT: fmov w8, s20 +; VBITS_GE_128-NEXT: mov z19.h, z18.h[7] +; VBITS_GE_128-NEXT: mov z21.h, z18.h[5] +; VBITS_GE_128-NEXT: mov z23.h, z18.h[3] +; VBITS_GE_128-NEXT: fmov w10, s19 +; VBITS_GE_128-NEXT: strb w9, [sp, #40] +; VBITS_GE_128-NEXT: fmov w9, s21 +; VBITS_GE_128-NEXT: strb w8, [sp, #38] +; VBITS_GE_128-NEXT: fmov w8, s23 +; VBITS_GE_128-NEXT: mov z22.h, z18.h[4] +; VBITS_GE_128-NEXT: mov z24.h, z18.h[2] +; VBITS_GE_128-NEXT: mov 
z25.h, z18.h[1] +; VBITS_GE_128-NEXT: mov z18.h, z17.h[7] +; VBITS_GE_128-NEXT: strb w10, [sp, #39] +; VBITS_GE_128-NEXT: fmov w10, s22 +; VBITS_GE_128-NEXT: strb w9, [sp, #37] +; VBITS_GE_128-NEXT: fmov w9, s24 +; VBITS_GE_128-NEXT: strb w8, [sp, #35] +; VBITS_GE_128-NEXT: fmov w8, s18 +; VBITS_GE_128-NEXT: mov z26.h, z17.h[6] +; VBITS_GE_128-NEXT: mov z28.h, z17.h[4] +; VBITS_GE_128-NEXT: strb w10, [sp, #36] +; VBITS_GE_128-NEXT: fmov w10, s25 +; VBITS_GE_128-NEXT: strb w9, [sp, #34] +; VBITS_GE_128-NEXT: fmov w9, s26 +; VBITS_GE_128-NEXT: strb w8, [sp, #47] +; VBITS_GE_128-NEXT: fmov w8, s28 +; VBITS_GE_128-NEXT: ext z5.b, z5.b, z5.b, #8 +; VBITS_GE_128-NEXT: ext z6.b, z6.b, z6.b, #8 +; VBITS_GE_128-NEXT: mov z27.h, z17.h[5] +; VBITS_GE_128-NEXT: mov z29.h, z17.h[3] +; VBITS_GE_128-NEXT: mov z30.h, z17.h[2] +; VBITS_GE_128-NEXT: sunpklo z5.h, z5.b +; VBITS_GE_128-NEXT: sunpklo z6.h, z6.b +; VBITS_GE_128-NEXT: strb w10, [sp, #33] +; VBITS_GE_128-NEXT: mul z5.h, p1/m, z5.h, z6.h +; VBITS_GE_128-NEXT: fmov w10, s27 +; VBITS_GE_128-NEXT: strb w9, [sp, #46] +; VBITS_GE_128-NEXT: fmov w9, s29 +; VBITS_GE_128-NEXT: movprfx z6, z16 +; VBITS_GE_128-NEXT: lsr z6.h, p1/m, z6.h, z2.h +; VBITS_GE_128-NEXT: strb w8, [sp, #44] +; VBITS_GE_128-NEXT: fmov w8, s30 +; VBITS_GE_128-NEXT: lsr z7.h, p1/m, z7.h, z2.h +; VBITS_GE_128-NEXT: mov z31.h, z17.h[1] +; VBITS_GE_128-NEXT: mov z16.h, z7.h[7] +; VBITS_GE_128-NEXT: strb w10, [sp, #45] +; VBITS_GE_128-NEXT: fmov w10, s7 +; VBITS_GE_128-NEXT: strb w9, [sp, #43] +; VBITS_GE_128-NEXT: fmov w9, s31 +; VBITS_GE_128-NEXT: strb w8, [sp, #42] +; VBITS_GE_128-NEXT: fmov w8, s16 +; VBITS_GE_128-NEXT: mov z17.h, z7.h[6] +; VBITS_GE_128-NEXT: mov z18.h, z7.h[5] +; VBITS_GE_128-NEXT: mov z19.h, z7.h[4] +; VBITS_GE_128-NEXT: strb w9, [sp, #41] +; VBITS_GE_128-NEXT: strb w10, [sp, #16] +; VBITS_GE_128-NEXT: fmov w9, s17 +; VBITS_GE_128-NEXT: fmov w10, s18 +; VBITS_GE_128-NEXT: strb w8, [sp, #23] +; VBITS_GE_128-NEXT: fmov w8, s19 +; VBITS_GE_128-NEXT: mov z20.h, z7.h[3] +; VBITS_GE_128-NEXT: mov z21.h, z7.h[2] +; VBITS_GE_128-NEXT: mov z22.h, z7.h[1] +; VBITS_GE_128-NEXT: strb w9, [sp, #22] +; VBITS_GE_128-NEXT: fmov w9, s20 +; VBITS_GE_128-NEXT: strb w10, [sp, #21] +; VBITS_GE_128-NEXT: fmov w10, s21 +; VBITS_GE_128-NEXT: strb w8, [sp, #20] +; VBITS_GE_128-NEXT: fmov w8, s22 +; VBITS_GE_128-NEXT: mov z7.h, z6.h[7] +; VBITS_GE_128-NEXT: mov z23.h, z6.h[6] +; VBITS_GE_128-NEXT: mov z24.h, z6.h[5] +; VBITS_GE_128-NEXT: strb w9, [sp, #19] +; VBITS_GE_128-NEXT: fmov w9, s7 +; VBITS_GE_128-NEXT: strb w10, [sp, #18] +; VBITS_GE_128-NEXT: fmov w10, s23 +; VBITS_GE_128-NEXT: strb w8, [sp, #17] +; VBITS_GE_128-NEXT: fmov w8, s24 +; VBITS_GE_128-NEXT: mov z25.h, z6.h[4] +; VBITS_GE_128-NEXT: mov z26.h, z6.h[3] +; VBITS_GE_128-NEXT: mov z27.h, z6.h[2] +; VBITS_GE_128-NEXT: strb w9, [sp, #31] +; VBITS_GE_128-NEXT: fmov w9, s25 +; VBITS_GE_128-NEXT: strb w10, [sp, #30] +; VBITS_GE_128-NEXT: fmov w10, s26 +; VBITS_GE_128-NEXT: strb w8, [sp, #29] +; VBITS_GE_128-NEXT: fmov w8, s27 +; VBITS_GE_128-NEXT: lsr z4.h, p1/m, z4.h, z2.h +; VBITS_GE_128-NEXT: mov z28.h, z6.h[1] +; VBITS_GE_128-NEXT: fmov w11, s6 +; VBITS_GE_128-NEXT: mov z6.h, z4.h[7] +; VBITS_GE_128-NEXT: strb w9, [sp, #28] +; VBITS_GE_128-NEXT: fmov w9, s28 +; VBITS_GE_128-NEXT: strb w10, [sp, #27] +; VBITS_GE_128-NEXT: fmov w10, s4 +; VBITS_GE_128-NEXT: strb w8, [sp, #26] +; VBITS_GE_128-NEXT: fmov w8, s6 +; VBITS_GE_128-NEXT: mov z7.h, z4.h[6] +; VBITS_GE_128-NEXT: mov z16.h, z4.h[5] +; VBITS_GE_128-NEXT: mov z17.h, 
z4.h[4] +; VBITS_GE_128-NEXT: strb w9, [sp, #25] +; VBITS_GE_128-NEXT: fmov w9, s7 +; VBITS_GE_128-NEXT: strb w10, [sp] +; VBITS_GE_128-NEXT: fmov w10, s16 +; VBITS_GE_128-NEXT: strb w8, [sp, #7] +; VBITS_GE_128-NEXT: fmov w8, s17 +; VBITS_GE_128-NEXT: mov z18.h, z4.h[3] +; VBITS_GE_128-NEXT: mov z19.h, z4.h[2] +; VBITS_GE_128-NEXT: mov z20.h, z4.h[1] +; VBITS_GE_128-NEXT: strb w9, [sp, #6] +; VBITS_GE_128-NEXT: fmov w9, s18 +; VBITS_GE_128-NEXT: strb w10, [sp, #5] +; VBITS_GE_128-NEXT: fmov w10, s19 +; VBITS_GE_128-NEXT: strb w8, [sp, #4] +; VBITS_GE_128-NEXT: fmov w8, s20 +; VBITS_GE_128-NEXT: lsr z5.h, p1/m, z5.h, z2.h +; VBITS_GE_128-NEXT: strb w9, [sp, #3] +; VBITS_GE_128-NEXT: mov z4.h, z5.h[7] +; VBITS_GE_128-NEXT: mov z21.h, z5.h[6] +; VBITS_GE_128-NEXT: mov z22.h, z5.h[5] +; VBITS_GE_128-NEXT: fmov w9, s4 +; VBITS_GE_128-NEXT: strb w10, [sp, #2] +; VBITS_GE_128-NEXT: fmov w10, s21 +; VBITS_GE_128-NEXT: strb w8, [sp, #1] +; VBITS_GE_128-NEXT: fmov w8, s22 +; VBITS_GE_128-NEXT: mov z23.h, z5.h[4] +; VBITS_GE_128-NEXT: mov z24.h, z5.h[3] +; VBITS_GE_128-NEXT: mov z25.h, z5.h[2] +; VBITS_GE_128-NEXT: strb w11, [sp, #24] +; VBITS_GE_128-NEXT: fmov w11, s5 +; VBITS_GE_128-NEXT: strb w9, [sp, #15] +; VBITS_GE_128-NEXT: fmov w9, s23 +; VBITS_GE_128-NEXT: strb w10, [sp, #14] +; VBITS_GE_128-NEXT: fmov w10, s24 +; VBITS_GE_128-NEXT: strb w8, [sp, #13] +; VBITS_GE_128-NEXT: fmov w8, s25 +; VBITS_GE_128-NEXT: ext z0.b, z0.b, z0.b, #8 +; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 +; VBITS_GE_128-NEXT: mov z26.h, z5.h[1] +; VBITS_GE_128-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_128-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_128-NEXT: strb w11, [sp, #8] +; VBITS_GE_128-NEXT: mul z0.h, p1/m, z0.h, z1.h +; VBITS_GE_128-NEXT: strb w9, [sp, #12] +; VBITS_GE_128-NEXT: lsr z0.h, p1/m, z0.h, z2.h +; VBITS_GE_128-NEXT: strb w10, [sp, #11] +; VBITS_GE_128-NEXT: movprfx z1, z3 +; VBITS_GE_128-NEXT: lsr z1.h, p1/m, z1.h, z2.h +; VBITS_GE_128-NEXT: strb w8, [sp, #10] +; VBITS_GE_128-NEXT: fmov w8, s26 +; VBITS_GE_128-NEXT: mov z2.h, z1.h[7] +; VBITS_GE_128-NEXT: fmov w9, s1 +; VBITS_GE_128-NEXT: fmov w10, s0 +; VBITS_GE_128-NEXT: mov z3.h, z1.h[6] +; VBITS_GE_128-NEXT: strb w8, [sp, #9] +; VBITS_GE_128-NEXT: fmov w8, s2 +; VBITS_GE_128-NEXT: mov z4.h, z1.h[5] +; VBITS_GE_128-NEXT: mov z5.h, z1.h[4] +; VBITS_GE_128-NEXT: strb w9, [sp, #48] +; VBITS_GE_128-NEXT: fmov w9, s3 +; VBITS_GE_128-NEXT: strb w10, [sp, #56] +; VBITS_GE_128-NEXT: fmov w10, s4 +; VBITS_GE_128-NEXT: strb w8, [sp, #55] +; VBITS_GE_128-NEXT: fmov w8, s5 +; VBITS_GE_128-NEXT: mov z6.h, z1.h[3] +; VBITS_GE_128-NEXT: mov z7.h, z1.h[2] +; VBITS_GE_128-NEXT: mov z16.h, z1.h[1] +; VBITS_GE_128-NEXT: strb w9, [sp, #54] +; VBITS_GE_128-NEXT: fmov w9, s6 +; VBITS_GE_128-NEXT: strb w10, [sp, #53] +; VBITS_GE_128-NEXT: fmov w10, s7 +; VBITS_GE_128-NEXT: strb w8, [sp, #52] +; VBITS_GE_128-NEXT: fmov w8, s16 +; VBITS_GE_128-NEXT: mov z1.h, z0.h[7] +; VBITS_GE_128-NEXT: mov z17.h, z0.h[6] +; VBITS_GE_128-NEXT: mov z18.h, z0.h[5] +; VBITS_GE_128-NEXT: strb w9, [sp, #51] +; VBITS_GE_128-NEXT: fmov w9, s1 +; VBITS_GE_128-NEXT: strb w10, [sp, #50] +; VBITS_GE_128-NEXT: fmov w10, s17 +; VBITS_GE_128-NEXT: strb w8, [sp, #49] +; VBITS_GE_128-NEXT: fmov w8, s18 +; VBITS_GE_128-NEXT: mov z19.h, z0.h[4] +; VBITS_GE_128-NEXT: mov z20.h, z0.h[3] +; VBITS_GE_128-NEXT: mov z21.h, z0.h[2] +; VBITS_GE_128-NEXT: strb w9, [sp, #63] +; VBITS_GE_128-NEXT: fmov w9, s19 +; VBITS_GE_128-NEXT: strb w10, [sp, #62] +; VBITS_GE_128-NEXT: fmov w10, s20 +; VBITS_GE_128-NEXT: strb w8, [sp, 
#61] +; VBITS_GE_128-NEXT: fmov w8, s21 +; VBITS_GE_128-NEXT: mov z0.h, z0.h[1] +; VBITS_GE_128-NEXT: strb w9, [sp, #60] +; VBITS_GE_128-NEXT: fmov w9, s0 +; VBITS_GE_128-NEXT: strb w10, [sp, #59] +; VBITS_GE_128-NEXT: add x10, sp, #32 +; VBITS_GE_128-NEXT: add x11, sp, #16 +; VBITS_GE_128-NEXT: strb w8, [sp, #58] +; VBITS_GE_128-NEXT: mov x8, sp +; VBITS_GE_128-NEXT: strb w9, [sp, #57] +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x10] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x11] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x8] +; VBITS_GE_128-NEXT: add x8, sp, #48 +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x8] +; VBITS_GE_128-NEXT: stp q2, q0, [x0] +; VBITS_GE_128-NEXT: stp q3, q1, [x0, #32] +; VBITS_GE_128-NEXT: add sp, sp, #64 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: smulh_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ptrue p1.h, vl16 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: sunpklo z4.h, z0.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: sunpklo z5.h, z1.b +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: sunpklo z6.h, z2.b +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: sunpklo z7.h, z3.b +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: sunpklo z2.h, z2.b +; VBITS_GE_256-NEXT: sunpklo z3.h, z3.b +; VBITS_GE_256-NEXT: mul z0.h, p1/m, z0.h, z2.h +; VBITS_GE_256-NEXT: movprfx z2, z5 +; VBITS_GE_256-NEXT: mul z2.h, p1/m, z2.h, z7.h +; VBITS_GE_256-NEXT: mul z1.h, p1/m, z1.h, z3.h +; VBITS_GE_256-NEXT: mul z4.h, p1/m, z4.h, z6.h +; VBITS_GE_256-NEXT: lsr z0.h, p1/m, z0.h, #8 +; VBITS_GE_256-NEXT: movprfx z3, z4 +; VBITS_GE_256-NEXT: lsr z3.h, p1/m, z3.h, #8 +; VBITS_GE_256-NEXT: lsr z1.h, p1/m, z1.h, #8 +; VBITS_GE_256-NEXT: lsr z2.h, p1/m, z2.h, #8 +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: ptrue p1.b, vl16 +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_256-NEXT: splice z3.b, p1, z3.b, z0.b +; VBITS_GE_256-NEXT: splice z2.b, p1, z2.b, z1.b +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: smulh_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %insert = insertelement <64 x i16> undef, i16 8, i64 0 + %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer + %1 = sext <64 x i8> %op1 to <64 x i16> + %2 = sext <64 x i8> %op2 to <64 x i16> + %mul = mul <64 x i16> %1, %2 + %shr = lshr <64 x i16> %mul, %splat + %res = trunc <64 x i16> %shr to <64 x i8> + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: smulh_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; 
CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <4 x i16> %op1 to <4 x i32> + %2 = sext <4 x i16> %op2 to <4 x i32> + %mul = mul <4 x i32> %1, %2 + %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16> + %res = trunc <4 x i32> %shr to <4 x i16> + ret <4 x i16> %res +} + +define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: smulh_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <8 x i16> %op1 to <8 x i32> + %2 = sext <8 x i16> %op2 to <8 x i32> + %mul = mul <8 x i32> %1, %2 + %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %res = trunc <8 x i32> %shr to <8 x i16> + ret <8 x i16> %res +} + +define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: smulh_v16i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.h, vl4 +; VBITS_GE_128-NEXT: movprfx z4, z1 +; VBITS_GE_128-NEXT: smulh z4.h, p0/m, z4.h, z3.h +; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 +; VBITS_GE_128-NEXT: ext z3.b, z3.b, z3.b, #8 +; VBITS_GE_128-NEXT: smulh z1.h, p0/m, z1.h, z3.h +; VBITS_GE_128-NEXT: movprfx z3, z0 +; VBITS_GE_128-NEXT: smulh z3.h, p0/m, z3.h, z2.h +; VBITS_GE_128-NEXT: ext z2.b, z2.b, z2.b, #8 +; VBITS_GE_128-NEXT: ext z0.b, z0.b, z0.b, #8 +; VBITS_GE_128-NEXT: smulh z0.h, p0/m, z0.h, z2.h +; VBITS_GE_128-NEXT: splice z4.h, p0, z4.h, z1.h +; VBITS_GE_128-NEXT: splice z3.h, p0, z3.h, z0.h +; VBITS_GE_128-NEXT: stp q4, q3, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: smulh_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: smulh_v16i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl16 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %1 = sext <16 x i16> %op1 to <16 x i32> + %2 = sext <16 x i16> %op2 to <16 x i32> + %mul = mul <16 x i32> %1, %2 + %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %res = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: smulh_v32i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #16 +; VBITS_GE_128-NEXT: mov x9, #24 +; VBITS_GE_128-NEXT: mov x10, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, 
[x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z7.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.h, vl4 +; VBITS_GE_128-NEXT: movprfx z18, z0 +; VBITS_GE_128-NEXT: smulh z18.h, p0/m, z18.h, z4.h +; VBITS_GE_128-NEXT: ext z0.b, z0.b, z0.b, #8 +; VBITS_GE_128-NEXT: mov z16.d, z2.d +; VBITS_GE_128-NEXT: ext z4.b, z4.b, z4.b, #8 +; VBITS_GE_128-NEXT: mov z17.d, z3.d +; VBITS_GE_128-NEXT: smulh z0.h, p0/m, z0.h, z4.h +; VBITS_GE_128-NEXT: movprfx z4, z1 +; VBITS_GE_128-NEXT: smulh z4.h, p0/m, z4.h, z5.h +; VBITS_GE_128-NEXT: ext z5.b, z5.b, z5.b, #8 +; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 +; VBITS_GE_128-NEXT: smulh z2.h, p0/m, z2.h, z6.h +; VBITS_GE_128-NEXT: smulh z3.h, p0/m, z3.h, z7.h +; VBITS_GE_128-NEXT: ext z6.b, z6.b, z6.b, #8 +; VBITS_GE_128-NEXT: smulh z1.h, p0/m, z1.h, z5.h +; VBITS_GE_128-NEXT: ext z16.b, z16.b, z16.b, #8 +; VBITS_GE_128-NEXT: ext z17.b, z17.b, z17.b, #8 +; VBITS_GE_128-NEXT: ext z7.b, z7.b, z7.b, #8 +; VBITS_GE_128-NEXT: movprfx z5, z16 +; VBITS_GE_128-NEXT: smulh z5.h, p0/m, z5.h, z6.h +; VBITS_GE_128-NEXT: movprfx z6, z17 +; VBITS_GE_128-NEXT: smulh z6.h, p0/m, z6.h, z7.h +; VBITS_GE_128-NEXT: splice z18.h, p0, z18.h, z0.h +; VBITS_GE_128-NEXT: splice z4.h, p0, z4.h, z1.h +; VBITS_GE_128-NEXT: splice z3.h, p0, z3.h, z6.h +; VBITS_GE_128-NEXT: splice z2.h, p0, z2.h, z5.h +; VBITS_GE_128-NEXT: stp q18, q4, [x0, #32] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: smulh_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: movprfx z4, z0 +; VBITS_GE_256-NEXT: smulh z4.h, p1/m, z4.h, z2.h +; VBITS_GE_256-NEXT: movprfx z5, z1 +; VBITS_GE_256-NEXT: smulh z5.h, p1/m, z5.h, z3.h +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: smulh z0.h, p1/m, z0.h, z2.h +; VBITS_GE_256-NEXT: smulh z1.h, p1/m, z1.h, z3.h +; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z0.h +; VBITS_GE_256-NEXT: splice z5.h, p1, z5.h, z1.h +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: smulh_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %1 = sext <32 x i16> %op1 to <32 x i32> + %2 = sext <32 x i16> %op2 to <32 x i32> + %mul = mul <32 x i32> %1, %2 + %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %res = trunc <32 x i32> %shr to <32 x i16> + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: smulh_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 
+; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <2 x i32> %op1 to <2 x i64> + %2 = sext <2 x i32> %op2 to <2 x i64> + %mul = mul <2 x i64> %1, %2 + %shr = lshr <2 x i64> %mul, <i64 32, i64 32> + %res = trunc <2 x i64> %shr to <2 x i32> + ret <2 x i32> %res +} + +define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: smulh_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <4 x i32> %op1 to <4 x i64> + %2 = sext <4 x i32> %op2 to <4 x i64> + %mul = mul <4 x i64> %1, %2 + %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32> + %res = trunc <4 x i64> %shr to <4 x i32> + ret <4 x i32> %res +} + +define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: smulh_v8i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl2 +; VBITS_GE_128-NEXT: movprfx z4, z1 +; VBITS_GE_128-NEXT: smulh z4.s, p0/m, z4.s, z3.s +; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 +; VBITS_GE_128-NEXT: ext z3.b, z3.b, z3.b, #8 +; VBITS_GE_128-NEXT: smulh z1.s, p0/m, z1.s, z3.s +; VBITS_GE_128-NEXT: movprfx z3, z0 +; VBITS_GE_128-NEXT: smulh z3.s, p0/m, z3.s, z2.s +; VBITS_GE_128-NEXT: ext z2.b, z2.b, z2.b, #8 +; VBITS_GE_128-NEXT: ext z0.b, z0.b, z0.b, #8 +; VBITS_GE_128-NEXT: smulh z0.s, p0/m, z0.s, z2.s +; VBITS_GE_128-NEXT: splice z4.s, p0, z4.s, z1.s +; VBITS_GE_128-NEXT: splice z3.s, p0, z3.s, z0.s +; VBITS_GE_128-NEXT: stp q4, q3, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: smulh_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: smulh_v8i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %1 = sext <8 x i32> %op1 to <8 x i64> + %2 = sext <8 x i32> %op2 to <8 x i64> + %mul = mul <8 x i64> %1, %2 + %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> + %res = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: smulh_v16i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: mov x9, #12 +; VBITS_GE_128-NEXT: mov x10, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl 
#2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl2 +; VBITS_GE_128-NEXT: movprfx z18, z0 +; VBITS_GE_128-NEXT: smulh z18.s, p0/m, z18.s, z4.s +; VBITS_GE_128-NEXT: ext z0.b, z0.b, z0.b, #8 +; VBITS_GE_128-NEXT: mov z16.d, z2.d +; VBITS_GE_128-NEXT: ext z4.b, z4.b, z4.b, #8 +; VBITS_GE_128-NEXT: mov z17.d, z3.d +; VBITS_GE_128-NEXT: smulh z0.s, p0/m, z0.s, z4.s +; VBITS_GE_128-NEXT: movprfx z4, z1 +; VBITS_GE_128-NEXT: smulh z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: ext z5.b, z5.b, z5.b, #8 +; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 +; VBITS_GE_128-NEXT: smulh z2.s, p0/m, z2.s, z6.s +; VBITS_GE_128-NEXT: smulh z3.s, p0/m, z3.s, z7.s +; VBITS_GE_128-NEXT: ext z6.b, z6.b, z6.b, #8 +; VBITS_GE_128-NEXT: smulh z1.s, p0/m, z1.s, z5.s +; VBITS_GE_128-NEXT: ext z16.b, z16.b, z16.b, #8 +; VBITS_GE_128-NEXT: ext z17.b, z17.b, z17.b, #8 +; VBITS_GE_128-NEXT: ext z7.b, z7.b, z7.b, #8 +; VBITS_GE_128-NEXT: movprfx z5, z16 +; VBITS_GE_128-NEXT: smulh z5.s, p0/m, z5.s, z6.s +; VBITS_GE_128-NEXT: movprfx z6, z17 +; VBITS_GE_128-NEXT: smulh z6.s, p0/m, z6.s, z7.s +; VBITS_GE_128-NEXT: splice z18.s, p0, z18.s, z0.s +; VBITS_GE_128-NEXT: splice z4.s, p0, z4.s, z1.s +; VBITS_GE_128-NEXT: splice z3.s, p0, z3.s, z6.s +; VBITS_GE_128-NEXT: splice z2.s, p0, z2.s, z5.s +; VBITS_GE_128-NEXT: stp q18, q4, [x0, #32] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: smulh_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: movprfx z4, z0 +; VBITS_GE_256-NEXT: smulh z4.s, p1/m, z4.s, z2.s +; VBITS_GE_256-NEXT: movprfx z5, z1 +; VBITS_GE_256-NEXT: smulh z5.s, p1/m, z5.s, z3.s +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: smulh z0.s, p1/m, z0.s, z2.s +; VBITS_GE_256-NEXT: smulh z1.s, p1/m, z1.s, z3.s +; VBITS_GE_256-NEXT: splice z4.s, p1, z4.s, z0.s +; VBITS_GE_256-NEXT: splice z5.s, p1, z5.s, z1.s +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: smulh_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %1 = sext <16 x i32> %op1 to <16 x i64> + %2 = sext <16 x i32> %op2 to <16 x i64> + %mul = mul <16 x i64> %1, %2 + %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> + %res = trunc <16 x i64> %shr to <16 x i32> + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: smulh_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 
killed $d0 killed $z0 +; CHECK-NEXT: ret + %insert = insertelement <1 x i128> undef, i128 64, i128 0 + %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer + %1 = sext <1 x i64> %op1 to <1 x i128> + %2 = sext <1 x i64> %op2 to <1 x i128> + %mul = mul <1 x i128> %1, %2 + %shr = lshr <1 x i128> %mul, %splat + %res = trunc <1 x i128> %shr to <1 x i64> + ret <1 x i64> %res +} + +define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: smulh_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <2 x i64> %op1 to <2 x i128> + %2 = sext <2 x i64> %op2 to <2 x i128> + %mul = mul <2 x i128> %1, %2 + %shr = lshr <2 x i128> %mul, <i128 64, i128 64> + %res = trunc <2 x i128> %shr to <2 x i64> + ret <2 x i64> %res +} + +define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: smulh_v4i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl1 +; VBITS_GE_128-NEXT: fmov x8, d0 +; VBITS_GE_128-NEXT: mov z4.d, z0.d[1] +; VBITS_GE_128-NEXT: fmov x10, d2 +; VBITS_GE_128-NEXT: mov z0.d, z1.d[1] +; VBITS_GE_128-NEXT: fmov x9, d1 +; VBITS_GE_128-NEXT: mov z1.d, z2.d[1] +; VBITS_GE_128-NEXT: mov z2.d, z3.d[1] +; VBITS_GE_128-NEXT: fmov x11, d3 +; VBITS_GE_128-NEXT: fmov x12, d0 +; VBITS_GE_128-NEXT: fmov x13, d2 +; VBITS_GE_128-NEXT: fmov x14, d4 +; VBITS_GE_128-NEXT: smulh x8, x8, x10 +; VBITS_GE_128-NEXT: fmov x10, d1 +; VBITS_GE_128-NEXT: smulh x9, x9, x11 +; VBITS_GE_128-NEXT: smulh x12, x12, x13 +; VBITS_GE_128-NEXT: smulh x10, x14, x10 +; VBITS_GE_128-NEXT: fmov d2, x8 +; VBITS_GE_128-NEXT: fmov d0, x9 +; VBITS_GE_128-NEXT: fmov d1, x12 +; VBITS_GE_128-NEXT: fmov d3, x10 +; VBITS_GE_128-NEXT: splice z0.d, p0, z0.d, z1.d +; VBITS_GE_128-NEXT: splice z2.d, p0, z2.d, z3.d +; VBITS_GE_128-NEXT: stp q0, q2, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: smulh_v4i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: smulh_v4i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl4 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %1 = sext <4 x i64> %op1 to <4 x i128> + %2 = sext <4 x i64> %op2 to <4 x i128> + %mul = mul <4 x i128> %1, %2 + %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64> + %res = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @smulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: smulh_v8i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #2 +; VBITS_GE_128-NEXT: mov x9, #4 +; VBITS_GE_128-NEXT: mov x10, #6 +; 
VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: mov z7.d, z0.d[1] +; VBITS_GE_128-NEXT: fmov x8, d0 +; VBITS_GE_128-NEXT: fmov x9, d7 +; VBITS_GE_128-NEXT: mov z7.d, z2.d[1] +; VBITS_GE_128-NEXT: fmov x10, d2 +; VBITS_GE_128-NEXT: mov z2.d, z1.d[1] +; VBITS_GE_128-NEXT: fmov x13, d2 +; VBITS_GE_128-NEXT: mov z2.d, z4.d[1] +; VBITS_GE_128-NEXT: fmov x17, d2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: fmov x12, d1 +; VBITS_GE_128-NEXT: fmov x18, d4 +; VBITS_GE_128-NEXT: mov z1.d, z3.d[1] +; VBITS_GE_128-NEXT: fmov x16, d6 +; VBITS_GE_128-NEXT: smulh x13, x13, x17 +; VBITS_GE_128-NEXT: fmov x17, d5 +; VBITS_GE_128-NEXT: fmov x15, d1 +; VBITS_GE_128-NEXT: mov z1.d, z6.d[1] +; VBITS_GE_128-NEXT: mov z2.d, z5.d[1] +; VBITS_GE_128-NEXT: smulh x12, x12, x18 +; VBITS_GE_128-NEXT: fmov x18, d1 +; VBITS_GE_128-NEXT: mov z1.d, z0.d[1] +; VBITS_GE_128-NEXT: fmov x11, d7 +; VBITS_GE_128-NEXT: fmov x14, d3 +; VBITS_GE_128-NEXT: fmov x1, d2 +; VBITS_GE_128-NEXT: smulh x10, x10, x17 +; VBITS_GE_128-NEXT: fmov x17, d1 +; VBITS_GE_128-NEXT: smulh x8, x8, x16 +; VBITS_GE_128-NEXT: fmov x16, d0 +; VBITS_GE_128-NEXT: smulh x9, x9, x18 +; VBITS_GE_128-NEXT: smulh x11, x11, x1 +; VBITS_GE_128-NEXT: fmov d1, x12 +; VBITS_GE_128-NEXT: smulh x15, x15, x17 +; VBITS_GE_128-NEXT: fmov d2, x13 +; VBITS_GE_128-NEXT: smulh x14, x14, x16 +; VBITS_GE_128-NEXT: fmov d0, x8 +; VBITS_GE_128-NEXT: fmov d3, x10 +; VBITS_GE_128-NEXT: fmov d7, x9 +; VBITS_GE_128-NEXT: fmov d4, x11 +; VBITS_GE_128-NEXT: ptrue p0.d, vl1 +; VBITS_GE_128-NEXT: fmov d6, x15 +; VBITS_GE_128-NEXT: splice z1.d, p0, z1.d, z2.d +; VBITS_GE_128-NEXT: fmov d5, x14 +; VBITS_GE_128-NEXT: splice z0.d, p0, z0.d, z7.d +; VBITS_GE_128-NEXT: splice z3.d, p0, z3.d, z4.d +; VBITS_GE_128-NEXT: stp q1, q3, [x0, #32] +; VBITS_GE_128-NEXT: splice z5.d, p0, z5.d, z6.d +; VBITS_GE_128-NEXT: stp q5, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: smulh_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl1 +; VBITS_GE_256-NEXT: ptrue p2.d, vl2 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fmov x9, d0 +; VBITS_GE_256-NEXT: mov z4.d, z0.d[1] +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: fmov x10, d1 +; VBITS_GE_256-NEXT: mov z5.d, z1.d[1] +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: fmov x11, d4 +; VBITS_GE_256-NEXT: mov z4.d, z0.d[1] +; VBITS_GE_256-NEXT: fmov x12, d0 +; VBITS_GE_256-NEXT: mov z0.d, z1.d[1] +; VBITS_GE_256-NEXT: fmov x15, d0 +; VBITS_GE_256-NEXT: fmov x16, d2 +; VBITS_GE_256-NEXT: mov z0.d, z2.d[1] +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: fmov x17, d0 +; VBITS_GE_256-NEXT: mov z0.d, z2.d[1] +; VBITS_GE_256-NEXT: fmov x18, d2 +; VBITS_GE_256-NEXT: fmov x1, d0 +; VBITS_GE_256-NEXT: fmov x2, d3 +; VBITS_GE_256-NEXT: mov z0.d, z3.d[1] +; VBITS_GE_256-NEXT: ext 
z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: fmov x13, d5 +; VBITS_GE_256-NEXT: mov z2.d, z3.d[1] +; VBITS_GE_256-NEXT: smulh x12, x12, x18 +; VBITS_GE_256-NEXT: fmov x18, d2 +; VBITS_GE_256-NEXT: fmov x14, d4 +; VBITS_GE_256-NEXT: smulh x11, x11, x17 +; VBITS_GE_256-NEXT: fmov x17, d0 +; VBITS_GE_256-NEXT: smulh x9, x9, x16 +; VBITS_GE_256-NEXT: fmov x16, d1 +; VBITS_GE_256-NEXT: smulh x15, x15, x18 +; VBITS_GE_256-NEXT: fmov x18, d3 +; VBITS_GE_256-NEXT: smulh x14, x14, x1 +; VBITS_GE_256-NEXT: fmov d2, x12 +; VBITS_GE_256-NEXT: smulh x13, x13, x17 +; VBITS_GE_256-NEXT: fmov d1, x11 +; VBITS_GE_256-NEXT: smulh x10, x10, x2 +; VBITS_GE_256-NEXT: fmov d0, x9 +; VBITS_GE_256-NEXT: smulh x16, x16, x18 +; VBITS_GE_256-NEXT: fmov d7, x15 +; VBITS_GE_256-NEXT: fmov d3, x14 +; VBITS_GE_256-NEXT: fmov d5, x13 +; VBITS_GE_256-NEXT: splice z0.d, p1, z0.d, z1.d +; VBITS_GE_256-NEXT: fmov d4, x10 +; VBITS_GE_256-NEXT: fmov d6, x16 +; VBITS_GE_256-NEXT: splice z2.d, p1, z2.d, z3.d +; VBITS_GE_256-NEXT: splice z0.d, p2, z0.d, z2.d +; VBITS_GE_256-NEXT: splice z4.d, p1, z4.d, z5.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: splice z6.d, p1, z6.d, z7.d +; VBITS_GE_256-NEXT: splice z4.d, p2, z4.d, z6.d +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: smulh_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %1 = sext <8 x i64> %op1 to <8 x i128> + %2 = sext <8 x i64> %op2 to <8 x i128> + %mul = mul <8 x i128> %1, %2 + %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64> + %res = trunc <8 x i128> %shr to <8 x i64> + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +; +; UMULH +; + +define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: umulh_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <8 x i8> %op1 to <8 x i16> + %2 = zext <8 x i8> %op2 to <8 x i16> + %mul = mul <8 x i16> %1, %2 + %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %res = trunc <8 x i16> %shr to <8 x i8> + ret <8 x i8> %res +} + +define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: umulh_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <16 x i8> %op1 to <16 x i16> + %2 = zext <16 x i8> %op2 to <16 x i16> + %mul = mul <16 x i16> %1, %2 + %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %res = trunc <16 x i16> %shr to <16 x i8> + ret <16 x i8> %res +} + +define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: umulh_v32i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ptrue p1.h, vl8 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, 
[x1] +; VBITS_GE_128-NEXT: adrp x8, .LCPI18_0 +; VBITS_GE_128-NEXT: add x8, x8, :lo12:.LCPI18_0 +; VBITS_GE_128-NEXT: uunpklo z5.h, z2.b +; VBITS_GE_128-NEXT: ext z2.b, z2.b, z2.b, #8 +; VBITS_GE_128-NEXT: uunpklo z7.h, z3.b +; VBITS_GE_128-NEXT: ld1h { z16.h }, p1/z, [x8] +; VBITS_GE_128-NEXT: ext z3.b, z3.b, z3.b, #8 +; VBITS_GE_128-NEXT: uunpklo z2.h, z2.b +; VBITS_GE_128-NEXT: uunpklo z3.h, z3.b +; VBITS_GE_128-NEXT: mul z5.h, p1/m, z5.h, z7.h +; VBITS_GE_128-NEXT: mul z2.h, p1/m, z2.h, z3.h +; VBITS_GE_128-NEXT: movprfx z3, z5 +; VBITS_GE_128-NEXT: lsr z3.h, p1/m, z3.h, z16.h +; VBITS_GE_128-NEXT: fmov w8, s3 +; VBITS_GE_128-NEXT: uunpklo z4.h, z0.b +; VBITS_GE_128-NEXT: uunpklo z6.h, z1.b +; VBITS_GE_128-NEXT: ext z0.b, z0.b, z0.b, #8 +; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 +; VBITS_GE_128-NEXT: lsr z2.h, p1/m, z2.h, z16.h +; VBITS_GE_128-NEXT: mov z5.h, z3.h[7] +; VBITS_GE_128-NEXT: uunpklo z0.h, z0.b +; VBITS_GE_128-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_128-NEXT: mul z4.h, p1/m, z4.h, z6.h +; VBITS_GE_128-NEXT: mov z6.h, z3.h[6] +; VBITS_GE_128-NEXT: mov z7.h, z3.h[5] +; VBITS_GE_128-NEXT: mov z17.h, z3.h[4] +; VBITS_GE_128-NEXT: mov z18.h, z3.h[3] +; VBITS_GE_128-NEXT: mov z19.h, z3.h[2] +; VBITS_GE_128-NEXT: mov z20.h, z3.h[1] +; VBITS_GE_128-NEXT: mov z3.h, z2.h[7] +; VBITS_GE_128-NEXT: mov z21.h, z2.h[6] +; VBITS_GE_128-NEXT: mov z22.h, z2.h[5] +; VBITS_GE_128-NEXT: mov z23.h, z2.h[4] +; VBITS_GE_128-NEXT: mov z24.h, z2.h[3] +; VBITS_GE_128-NEXT: mov z25.h, z2.h[2] +; VBITS_GE_128-NEXT: mov z26.h, z2.h[1] +; VBITS_GE_128-NEXT: fmov w9, s2 +; VBITS_GE_128-NEXT: mul z0.h, p1/m, z0.h, z1.h +; VBITS_GE_128-NEXT: fmov w10, s5 +; VBITS_GE_128-NEXT: strb w8, [sp, #-32]! +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 32 +; VBITS_GE_128-NEXT: fmov w8, s6 +; VBITS_GE_128-NEXT: strb w9, [sp, #8] +; VBITS_GE_128-NEXT: fmov w9, s7 +; VBITS_GE_128-NEXT: strb w10, [sp, #7] +; VBITS_GE_128-NEXT: fmov w10, s17 +; VBITS_GE_128-NEXT: lsr z0.h, p1/m, z0.h, z16.h +; VBITS_GE_128-NEXT: strb w8, [sp, #6] +; VBITS_GE_128-NEXT: fmov w8, s18 +; VBITS_GE_128-NEXT: strb w9, [sp, #5] +; VBITS_GE_128-NEXT: fmov w9, s19 +; VBITS_GE_128-NEXT: strb w10, [sp, #4] +; VBITS_GE_128-NEXT: fmov w10, s20 +; VBITS_GE_128-NEXT: strb w8, [sp, #3] +; VBITS_GE_128-NEXT: fmov w8, s3 +; VBITS_GE_128-NEXT: strb w9, [sp, #2] +; VBITS_GE_128-NEXT: fmov w9, s21 +; VBITS_GE_128-NEXT: strb w10, [sp, #1] +; VBITS_GE_128-NEXT: fmov w10, s22 +; VBITS_GE_128-NEXT: strb w8, [sp, #15] +; VBITS_GE_128-NEXT: fmov w8, s23 +; VBITS_GE_128-NEXT: strb w9, [sp, #14] +; VBITS_GE_128-NEXT: fmov w9, s24 +; VBITS_GE_128-NEXT: strb w10, [sp, #13] +; VBITS_GE_128-NEXT: fmov w10, s25 +; VBITS_GE_128-NEXT: strb w8, [sp, #12] +; VBITS_GE_128-NEXT: fmov w8, s26 +; VBITS_GE_128-NEXT: movprfx z1, z4 +; VBITS_GE_128-NEXT: lsr z1.h, p1/m, z1.h, z16.h +; VBITS_GE_128-NEXT: strb w9, [sp, #11] +; VBITS_GE_128-NEXT: mov z2.h, z1.h[7] +; VBITS_GE_128-NEXT: fmov w9, s1 +; VBITS_GE_128-NEXT: strb w10, [sp, #10] +; VBITS_GE_128-NEXT: fmov w10, s0 +; VBITS_GE_128-NEXT: strb w8, [sp, #9] +; VBITS_GE_128-NEXT: fmov w8, s2 +; VBITS_GE_128-NEXT: mov z3.h, z1.h[6] +; VBITS_GE_128-NEXT: mov z4.h, z1.h[5] +; VBITS_GE_128-NEXT: mov z5.h, z1.h[4] +; VBITS_GE_128-NEXT: strb w9, [sp, #16] +; VBITS_GE_128-NEXT: fmov w9, s3 +; VBITS_GE_128-NEXT: strb w10, [sp, #24] +; VBITS_GE_128-NEXT: fmov w10, s4 +; VBITS_GE_128-NEXT: strb w8, [sp, #23] +; VBITS_GE_128-NEXT: fmov w8, s5 +; VBITS_GE_128-NEXT: mov z6.h, z1.h[3] +; VBITS_GE_128-NEXT: mov z7.h, z1.h[2] +; VBITS_GE_128-NEXT: mov 
z16.h, z1.h[1] +; VBITS_GE_128-NEXT: strb w9, [sp, #22] +; VBITS_GE_128-NEXT: fmov w9, s6 +; VBITS_GE_128-NEXT: strb w10, [sp, #21] +; VBITS_GE_128-NEXT: fmov w10, s7 +; VBITS_GE_128-NEXT: strb w8, [sp, #20] +; VBITS_GE_128-NEXT: fmov w8, s16 +; VBITS_GE_128-NEXT: mov z1.h, z0.h[7] +; VBITS_GE_128-NEXT: mov z17.h, z0.h[6] +; VBITS_GE_128-NEXT: mov z18.h, z0.h[5] +; VBITS_GE_128-NEXT: strb w9, [sp, #19] +; VBITS_GE_128-NEXT: fmov w9, s1 +; VBITS_GE_128-NEXT: strb w10, [sp, #18] +; VBITS_GE_128-NEXT: fmov w10, s17 +; VBITS_GE_128-NEXT: strb w8, [sp, #17] +; VBITS_GE_128-NEXT: fmov w8, s18 +; VBITS_GE_128-NEXT: mov z19.h, z0.h[4] +; VBITS_GE_128-NEXT: mov z20.h, z0.h[3] +; VBITS_GE_128-NEXT: mov z21.h, z0.h[2] +; VBITS_GE_128-NEXT: strb w9, [sp, #31] +; VBITS_GE_128-NEXT: fmov w9, s19 +; VBITS_GE_128-NEXT: strb w10, [sp, #30] +; VBITS_GE_128-NEXT: fmov w10, s20 +; VBITS_GE_128-NEXT: strb w8, [sp, #29] +; VBITS_GE_128-NEXT: fmov w8, s21 +; VBITS_GE_128-NEXT: mov z22.h, z0.h[1] +; VBITS_GE_128-NEXT: strb w9, [sp, #28] +; VBITS_GE_128-NEXT: fmov w9, s22 +; VBITS_GE_128-NEXT: strb w10, [sp, #27] +; VBITS_GE_128-NEXT: mov x10, sp +; VBITS_GE_128-NEXT: strb w8, [sp, #26] +; VBITS_GE_128-NEXT: add x8, sp, #16 +; VBITS_GE_128-NEXT: strb w9, [sp, #25] +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x10] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x8] +; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: add sp, sp, #32 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: umulh_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: umulh_v32i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl32 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %1 = zext <32 x i8> %op1 to <32 x i16> + %2 = zext <32 x i8> %op2 to <32 x i16> + %mul = mul <32 x i16> %1, %2 + %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %res = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: umulh_v64i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #64 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 64 +; VBITS_GE_128-NEXT: mov w8, #16 +; VBITS_GE_128-NEXT: mov w9, #48 +; VBITS_GE_128-NEXT: mov w10, #32 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ptrue p1.h, vl8 +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z7.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: adrp x8, .LCPI19_0 +; VBITS_GE_128-NEXT: add x8, x8, :lo12:.LCPI19_0 +; VBITS_GE_128-NEXT: uunpklo z18.h, z2.b +; VBITS_GE_128-NEXT: ext z2.b, z2.b, z2.b, #8 +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: uunpklo z19.h, z2.b +; VBITS_GE_128-NEXT: ld1h { z2.h }, p1/z, [x8] +; VBITS_GE_128-NEXT: uunpklo z21.h, z7.b +; 
VBITS_GE_128-NEXT: ext z7.b, z7.b, z7.b, #8 +; VBITS_GE_128-NEXT: uunpklo z23.h, z7.b +; VBITS_GE_128-NEXT: uunpklo z7.h, z4.b +; VBITS_GE_128-NEXT: mul z18.h, p1/m, z18.h, z7.h +; VBITS_GE_128-NEXT: uunpklo z16.h, z3.b +; VBITS_GE_128-NEXT: lsr z18.h, p1/m, z18.h, z2.h +; VBITS_GE_128-NEXT: ext z3.b, z3.b, z3.b, #8 +; VBITS_GE_128-NEXT: ext z4.b, z4.b, z4.b, #8 +; VBITS_GE_128-NEXT: fmov w8, s18 +; VBITS_GE_128-NEXT: uunpklo z3.h, z3.b +; VBITS_GE_128-NEXT: uunpklo z20.h, z5.b +; VBITS_GE_128-NEXT: uunpklo z4.h, z4.b +; VBITS_GE_128-NEXT: uunpklo z17.h, z0.b +; VBITS_GE_128-NEXT: uunpklo z22.h, z1.b +; VBITS_GE_128-NEXT: uunpklo z24.h, z6.b +; VBITS_GE_128-NEXT: mul z19.h, p1/m, z19.h, z4.h +; VBITS_GE_128-NEXT: movprfx z7, z16 +; VBITS_GE_128-NEXT: mul z7.h, p1/m, z7.h, z21.h +; VBITS_GE_128-NEXT: movprfx z16, z3 +; VBITS_GE_128-NEXT: mul z16.h, p1/m, z16.h, z23.h +; VBITS_GE_128-NEXT: movprfx z3, z17 +; VBITS_GE_128-NEXT: mul z3.h, p1/m, z3.h, z22.h +; VBITS_GE_128-NEXT: movprfx z4, z20 +; VBITS_GE_128-NEXT: mul z4.h, p1/m, z4.h, z24.h +; VBITS_GE_128-NEXT: mov z20.h, z18.h[6] +; VBITS_GE_128-NEXT: movprfx z17, z19 +; VBITS_GE_128-NEXT: lsr z17.h, p1/m, z17.h, z2.h +; VBITS_GE_128-NEXT: fmov w9, s17 +; VBITS_GE_128-NEXT: strb w8, [sp, #32] +; VBITS_GE_128-NEXT: fmov w8, s20 +; VBITS_GE_128-NEXT: mov z19.h, z18.h[7] +; VBITS_GE_128-NEXT: mov z21.h, z18.h[5] +; VBITS_GE_128-NEXT: mov z23.h, z18.h[3] +; VBITS_GE_128-NEXT: fmov w10, s19 +; VBITS_GE_128-NEXT: strb w9, [sp, #40] +; VBITS_GE_128-NEXT: fmov w9, s21 +; VBITS_GE_128-NEXT: strb w8, [sp, #38] +; VBITS_GE_128-NEXT: fmov w8, s23 +; VBITS_GE_128-NEXT: mov z22.h, z18.h[4] +; VBITS_GE_128-NEXT: mov z24.h, z18.h[2] +; VBITS_GE_128-NEXT: mov z25.h, z18.h[1] +; VBITS_GE_128-NEXT: mov z18.h, z17.h[7] +; VBITS_GE_128-NEXT: strb w10, [sp, #39] +; VBITS_GE_128-NEXT: fmov w10, s22 +; VBITS_GE_128-NEXT: strb w9, [sp, #37] +; VBITS_GE_128-NEXT: fmov w9, s24 +; VBITS_GE_128-NEXT: strb w8, [sp, #35] +; VBITS_GE_128-NEXT: fmov w8, s18 +; VBITS_GE_128-NEXT: mov z26.h, z17.h[6] +; VBITS_GE_128-NEXT: mov z28.h, z17.h[4] +; VBITS_GE_128-NEXT: strb w10, [sp, #36] +; VBITS_GE_128-NEXT: fmov w10, s25 +; VBITS_GE_128-NEXT: strb w9, [sp, #34] +; VBITS_GE_128-NEXT: fmov w9, s26 +; VBITS_GE_128-NEXT: strb w8, [sp, #47] +; VBITS_GE_128-NEXT: fmov w8, s28 +; VBITS_GE_128-NEXT: ext z5.b, z5.b, z5.b, #8 +; VBITS_GE_128-NEXT: ext z6.b, z6.b, z6.b, #8 +; VBITS_GE_128-NEXT: mov z27.h, z17.h[5] +; VBITS_GE_128-NEXT: mov z29.h, z17.h[3] +; VBITS_GE_128-NEXT: mov z30.h, z17.h[2] +; VBITS_GE_128-NEXT: uunpklo z5.h, z5.b +; VBITS_GE_128-NEXT: uunpklo z6.h, z6.b +; VBITS_GE_128-NEXT: strb w10, [sp, #33] +; VBITS_GE_128-NEXT: mul z5.h, p1/m, z5.h, z6.h +; VBITS_GE_128-NEXT: fmov w10, s27 +; VBITS_GE_128-NEXT: strb w9, [sp, #46] +; VBITS_GE_128-NEXT: fmov w9, s29 +; VBITS_GE_128-NEXT: movprfx z6, z16 +; VBITS_GE_128-NEXT: lsr z6.h, p1/m, z6.h, z2.h +; VBITS_GE_128-NEXT: strb w8, [sp, #44] +; VBITS_GE_128-NEXT: fmov w8, s30 +; VBITS_GE_128-NEXT: lsr z7.h, p1/m, z7.h, z2.h +; VBITS_GE_128-NEXT: mov z31.h, z17.h[1] +; VBITS_GE_128-NEXT: mov z16.h, z7.h[7] +; VBITS_GE_128-NEXT: strb w10, [sp, #45] +; VBITS_GE_128-NEXT: fmov w10, s7 +; VBITS_GE_128-NEXT: strb w9, [sp, #43] +; VBITS_GE_128-NEXT: fmov w9, s31 +; VBITS_GE_128-NEXT: strb w8, [sp, #42] +; VBITS_GE_128-NEXT: fmov w8, s16 +; VBITS_GE_128-NEXT: mov z17.h, z7.h[6] +; VBITS_GE_128-NEXT: mov z18.h, z7.h[5] +; VBITS_GE_128-NEXT: mov z19.h, z7.h[4] +; VBITS_GE_128-NEXT: strb w9, [sp, #41] +; VBITS_GE_128-NEXT: strb w10, [sp, 
#16] +; VBITS_GE_128-NEXT: fmov w9, s17 +; VBITS_GE_128-NEXT: fmov w10, s18 +; VBITS_GE_128-NEXT: strb w8, [sp, #23] +; VBITS_GE_128-NEXT: fmov w8, s19 +; VBITS_GE_128-NEXT: mov z20.h, z7.h[3] +; VBITS_GE_128-NEXT: mov z21.h, z7.h[2] +; VBITS_GE_128-NEXT: mov z22.h, z7.h[1] +; VBITS_GE_128-NEXT: strb w9, [sp, #22] +; VBITS_GE_128-NEXT: fmov w9, s20 +; VBITS_GE_128-NEXT: strb w10, [sp, #21] +; VBITS_GE_128-NEXT: fmov w10, s21 +; VBITS_GE_128-NEXT: strb w8, [sp, #20] +; VBITS_GE_128-NEXT: fmov w8, s22 +; VBITS_GE_128-NEXT: mov z7.h, z6.h[7] +; VBITS_GE_128-NEXT: mov z23.h, z6.h[6] +; VBITS_GE_128-NEXT: mov z24.h, z6.h[5] +; VBITS_GE_128-NEXT: strb w9, [sp, #19] +; VBITS_GE_128-NEXT: fmov w9, s7 +; VBITS_GE_128-NEXT: strb w10, [sp, #18] +; VBITS_GE_128-NEXT: fmov w10, s23 +; VBITS_GE_128-NEXT: strb w8, [sp, #17] +; VBITS_GE_128-NEXT: fmov w8, s24 +; VBITS_GE_128-NEXT: mov z25.h, z6.h[4] +; VBITS_GE_128-NEXT: mov z26.h, z6.h[3] +; VBITS_GE_128-NEXT: mov z27.h, z6.h[2] +; VBITS_GE_128-NEXT: strb w9, [sp, #31] +; VBITS_GE_128-NEXT: fmov w9, s25 +; VBITS_GE_128-NEXT: strb w10, [sp, #30] +; VBITS_GE_128-NEXT: fmov w10, s26 +; VBITS_GE_128-NEXT: strb w8, [sp, #29] +; VBITS_GE_128-NEXT: fmov w8, s27 +; VBITS_GE_128-NEXT: lsr z4.h, p1/m, z4.h, z2.h +; VBITS_GE_128-NEXT: mov z28.h, z6.h[1] +; VBITS_GE_128-NEXT: fmov w11, s6 +; VBITS_GE_128-NEXT: mov z6.h, z4.h[7] +; VBITS_GE_128-NEXT: strb w9, [sp, #28] +; VBITS_GE_128-NEXT: fmov w9, s28 +; VBITS_GE_128-NEXT: strb w10, [sp, #27] +; VBITS_GE_128-NEXT: fmov w10, s4 +; VBITS_GE_128-NEXT: strb w8, [sp, #26] +; VBITS_GE_128-NEXT: fmov w8, s6 +; VBITS_GE_128-NEXT: mov z7.h, z4.h[6] +; VBITS_GE_128-NEXT: mov z16.h, z4.h[5] +; VBITS_GE_128-NEXT: mov z17.h, z4.h[4] +; VBITS_GE_128-NEXT: strb w9, [sp, #25] +; VBITS_GE_128-NEXT: fmov w9, s7 +; VBITS_GE_128-NEXT: strb w10, [sp] +; VBITS_GE_128-NEXT: fmov w10, s16 +; VBITS_GE_128-NEXT: strb w8, [sp, #7] +; VBITS_GE_128-NEXT: fmov w8, s17 +; VBITS_GE_128-NEXT: mov z18.h, z4.h[3] +; VBITS_GE_128-NEXT: mov z19.h, z4.h[2] +; VBITS_GE_128-NEXT: mov z20.h, z4.h[1] +; VBITS_GE_128-NEXT: strb w9, [sp, #6] +; VBITS_GE_128-NEXT: fmov w9, s18 +; VBITS_GE_128-NEXT: strb w10, [sp, #5] +; VBITS_GE_128-NEXT: fmov w10, s19 +; VBITS_GE_128-NEXT: strb w8, [sp, #4] +; VBITS_GE_128-NEXT: fmov w8, s20 +; VBITS_GE_128-NEXT: lsr z5.h, p1/m, z5.h, z2.h +; VBITS_GE_128-NEXT: strb w9, [sp, #3] +; VBITS_GE_128-NEXT: mov z4.h, z5.h[7] +; VBITS_GE_128-NEXT: mov z21.h, z5.h[6] +; VBITS_GE_128-NEXT: mov z22.h, z5.h[5] +; VBITS_GE_128-NEXT: fmov w9, s4 +; VBITS_GE_128-NEXT: strb w10, [sp, #2] +; VBITS_GE_128-NEXT: fmov w10, s21 +; VBITS_GE_128-NEXT: strb w8, [sp, #1] +; VBITS_GE_128-NEXT: fmov w8, s22 +; VBITS_GE_128-NEXT: mov z23.h, z5.h[4] +; VBITS_GE_128-NEXT: mov z24.h, z5.h[3] +; VBITS_GE_128-NEXT: mov z25.h, z5.h[2] +; VBITS_GE_128-NEXT: strb w11, [sp, #24] +; VBITS_GE_128-NEXT: fmov w11, s5 +; VBITS_GE_128-NEXT: strb w9, [sp, #15] +; VBITS_GE_128-NEXT: fmov w9, s23 +; VBITS_GE_128-NEXT: strb w10, [sp, #14] +; VBITS_GE_128-NEXT: fmov w10, s24 +; VBITS_GE_128-NEXT: strb w8, [sp, #13] +; VBITS_GE_128-NEXT: fmov w8, s25 +; VBITS_GE_128-NEXT: ext z0.b, z0.b, z0.b, #8 +; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 +; VBITS_GE_128-NEXT: mov z26.h, z5.h[1] +; VBITS_GE_128-NEXT: uunpklo z0.h, z0.b +; VBITS_GE_128-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_128-NEXT: strb w11, [sp, #8] +; VBITS_GE_128-NEXT: mul z0.h, p1/m, z0.h, z1.h +; VBITS_GE_128-NEXT: strb w9, [sp, #12] +; VBITS_GE_128-NEXT: lsr z0.h, p1/m, z0.h, z2.h +; VBITS_GE_128-NEXT: strb w10, 
[sp, #11] +; VBITS_GE_128-NEXT: movprfx z1, z3 +; VBITS_GE_128-NEXT: lsr z1.h, p1/m, z1.h, z2.h +; VBITS_GE_128-NEXT: strb w8, [sp, #10] +; VBITS_GE_128-NEXT: fmov w8, s26 +; VBITS_GE_128-NEXT: mov z2.h, z1.h[7] +; VBITS_GE_128-NEXT: fmov w9, s1 +; VBITS_GE_128-NEXT: fmov w10, s0 +; VBITS_GE_128-NEXT: mov z3.h, z1.h[6] +; VBITS_GE_128-NEXT: strb w8, [sp, #9] +; VBITS_GE_128-NEXT: fmov w8, s2 +; VBITS_GE_128-NEXT: mov z4.h, z1.h[5] +; VBITS_GE_128-NEXT: mov z5.h, z1.h[4] +; VBITS_GE_128-NEXT: strb w9, [sp, #48] +; VBITS_GE_128-NEXT: fmov w9, s3 +; VBITS_GE_128-NEXT: strb w10, [sp, #56] +; VBITS_GE_128-NEXT: fmov w10, s4 +; VBITS_GE_128-NEXT: strb w8, [sp, #55] +; VBITS_GE_128-NEXT: fmov w8, s5 +; VBITS_GE_128-NEXT: mov z6.h, z1.h[3] +; VBITS_GE_128-NEXT: mov z7.h, z1.h[2] +; VBITS_GE_128-NEXT: mov z16.h, z1.h[1] +; VBITS_GE_128-NEXT: strb w9, [sp, #54] +; VBITS_GE_128-NEXT: fmov w9, s6 +; VBITS_GE_128-NEXT: strb w10, [sp, #53] +; VBITS_GE_128-NEXT: fmov w10, s7 +; VBITS_GE_128-NEXT: strb w8, [sp, #52] +; VBITS_GE_128-NEXT: fmov w8, s16 +; VBITS_GE_128-NEXT: mov z1.h, z0.h[7] +; VBITS_GE_128-NEXT: mov z17.h, z0.h[6] +; VBITS_GE_128-NEXT: mov z18.h, z0.h[5] +; VBITS_GE_128-NEXT: strb w9, [sp, #51] +; VBITS_GE_128-NEXT: fmov w9, s1 +; VBITS_GE_128-NEXT: strb w10, [sp, #50] +; VBITS_GE_128-NEXT: fmov w10, s17 +; VBITS_GE_128-NEXT: strb w8, [sp, #49] +; VBITS_GE_128-NEXT: fmov w8, s18 +; VBITS_GE_128-NEXT: mov z19.h, z0.h[4] +; VBITS_GE_128-NEXT: mov z20.h, z0.h[3] +; VBITS_GE_128-NEXT: mov z21.h, z0.h[2] +; VBITS_GE_128-NEXT: strb w9, [sp, #63] +; VBITS_GE_128-NEXT: fmov w9, s19 +; VBITS_GE_128-NEXT: strb w10, [sp, #62] +; VBITS_GE_128-NEXT: fmov w10, s20 +; VBITS_GE_128-NEXT: strb w8, [sp, #61] +; VBITS_GE_128-NEXT: fmov w8, s21 +; VBITS_GE_128-NEXT: mov z0.h, z0.h[1] +; VBITS_GE_128-NEXT: strb w9, [sp, #60] +; VBITS_GE_128-NEXT: fmov w9, s0 +; VBITS_GE_128-NEXT: strb w10, [sp, #59] +; VBITS_GE_128-NEXT: add x10, sp, #32 +; VBITS_GE_128-NEXT: add x11, sp, #16 +; VBITS_GE_128-NEXT: strb w8, [sp, #58] +; VBITS_GE_128-NEXT: mov x8, sp +; VBITS_GE_128-NEXT: strb w9, [sp, #57] +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x10] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x11] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x8] +; VBITS_GE_128-NEXT: add x8, sp, #48 +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x8] +; VBITS_GE_128-NEXT: stp q2, q0, [x0] +; VBITS_GE_128-NEXT: stp q3, q1, [x0, #32] +; VBITS_GE_128-NEXT: add sp, sp, #64 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: umulh_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ptrue p1.h, vl16 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: uunpklo z4.h, z0.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: uunpklo z5.h, z1.b +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: uunpklo z6.h, z2.b +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: uunpklo z7.h, z3.b +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b +; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: uunpklo z2.h, z2.b +; VBITS_GE_256-NEXT: uunpklo z3.h, z3.b +; VBITS_GE_256-NEXT: mul z0.h, p1/m, z0.h, z2.h +; VBITS_GE_256-NEXT: movprfx z2, z5 +; VBITS_GE_256-NEXT: mul z2.h, p1/m, z2.h, z7.h +; VBITS_GE_256-NEXT: mul z1.h, p1/m, 
z1.h, z3.h +; VBITS_GE_256-NEXT: mul z4.h, p1/m, z4.h, z6.h +; VBITS_GE_256-NEXT: lsr z0.h, p1/m, z0.h, #8 +; VBITS_GE_256-NEXT: movprfx z3, z4 +; VBITS_GE_256-NEXT: lsr z3.h, p1/m, z3.h, #8 +; VBITS_GE_256-NEXT: lsr z1.h, p1/m, z1.h, #8 +; VBITS_GE_256-NEXT: lsr z2.h, p1/m, z2.h, #8 +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: ptrue p1.b, vl16 +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_256-NEXT: splice z3.b, p1, z3.b, z0.b +; VBITS_GE_256-NEXT: splice z2.b, p1, z2.b, z1.b +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: umulh_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %1 = zext <64 x i8> %op1 to <64 x i16> + %2 = zext <64 x i8> %op2 to <64 x i16> + %mul = mul <64 x i16> %1, %2 + %shr = lshr <64 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %res = trunc <64 x i16> %shr to <64 x i8> + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: umulh_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <4 x i16> %op1 to <4 x i32> + %2 = zext <4 x i16> %op2 to <4 x i32> + %mul = mul <4 x i32> %1, %2 + %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16> + %res = trunc <4 x i32> %shr to <4 x i16> + ret <4 x i16> %res +} + +define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: umulh_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <8 x i16> %op1 to <8 x i32> + %2 = zext <8 x i16> %op2 to <8 x i32> + %mul = mul <8 x i32> %1, %2 + %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %res = trunc <8 x i32> %shr to <8 x i16> + ret <8 x i16> %res +} + +define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: umulh_v16i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.h, vl4 +; VBITS_GE_128-NEXT: movprfx z4, z1 +; VBITS_GE_128-NEXT: umulh z4.h, p0/m, z4.h, z3.h +; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 +; VBITS_GE_128-NEXT: ext z3.b, z3.b, z3.b, #8 +; VBITS_GE_128-NEXT: umulh z1.h, p0/m, z1.h, z3.h +; VBITS_GE_128-NEXT: movprfx z3, z0 +; VBITS_GE_128-NEXT: umulh z3.h, p0/m, z3.h, z2.h +; VBITS_GE_128-NEXT: ext z2.b, z2.b, z2.b, #8 +; VBITS_GE_128-NEXT: ext z0.b, z0.b, z0.b, #8 +; VBITS_GE_128-NEXT: umulh z0.h, p0/m, z0.h, z2.h +; VBITS_GE_128-NEXT: splice z4.h, p0, z4.h, z1.h +; VBITS_GE_128-NEXT: splice z3.h, p0, z3.h, z0.h +; 
VBITS_GE_128-NEXT: stp q4, q3, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: umulh_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: umulh_v16i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl16 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %1 = zext <16 x i16> %op1 to <16 x i32> + %2 = zext <16 x i16> %op2 to <16 x i32> + %mul = mul <16 x i32> %1, %2 + %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %res = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: umulh_v32i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #16 +; VBITS_GE_128-NEXT: mov x9, #24 +; VBITS_GE_128-NEXT: mov x10, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z7.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.h, vl4 +; VBITS_GE_128-NEXT: movprfx z18, z0 +; VBITS_GE_128-NEXT: umulh z18.h, p0/m, z18.h, z4.h +; VBITS_GE_128-NEXT: ext z0.b, z0.b, z0.b, #8 +; VBITS_GE_128-NEXT: mov z16.d, z2.d +; VBITS_GE_128-NEXT: ext z4.b, z4.b, z4.b, #8 +; VBITS_GE_128-NEXT: mov z17.d, z3.d +; VBITS_GE_128-NEXT: umulh z0.h, p0/m, z0.h, z4.h +; VBITS_GE_128-NEXT: movprfx z4, z1 +; VBITS_GE_128-NEXT: umulh z4.h, p0/m, z4.h, z5.h +; VBITS_GE_128-NEXT: ext z5.b, z5.b, z5.b, #8 +; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 +; VBITS_GE_128-NEXT: umulh z2.h, p0/m, z2.h, z6.h +; VBITS_GE_128-NEXT: umulh z3.h, p0/m, z3.h, z7.h +; VBITS_GE_128-NEXT: ext z6.b, z6.b, z6.b, #8 +; VBITS_GE_128-NEXT: umulh z1.h, p0/m, z1.h, z5.h +; VBITS_GE_128-NEXT: ext z16.b, z16.b, z16.b, #8 +; VBITS_GE_128-NEXT: ext z17.b, z17.b, z17.b, #8 +; VBITS_GE_128-NEXT: ext z7.b, z7.b, z7.b, #8 +; VBITS_GE_128-NEXT: movprfx z5, z16 +; VBITS_GE_128-NEXT: umulh z5.h, p0/m, z5.h, z6.h +; VBITS_GE_128-NEXT: movprfx z6, z17 +; VBITS_GE_128-NEXT: umulh z6.h, p0/m, z6.h, z7.h +; VBITS_GE_128-NEXT: splice z18.h, p0, z18.h, z0.h +; VBITS_GE_128-NEXT: splice z4.h, p0, z4.h, z1.h +; VBITS_GE_128-NEXT: splice z3.h, p0, z3.h, z6.h +; VBITS_GE_128-NEXT: splice z2.h, p0, z2.h, z5.h +; VBITS_GE_128-NEXT: stp q18, q4, [x0, #32] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: umulh_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: 
movprfx z4, z0 +; VBITS_GE_256-NEXT: umulh z4.h, p1/m, z4.h, z2.h +; VBITS_GE_256-NEXT: movprfx z5, z1 +; VBITS_GE_256-NEXT: umulh z5.h, p1/m, z5.h, z3.h +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: umulh z0.h, p1/m, z0.h, z2.h +; VBITS_GE_256-NEXT: umulh z1.h, p1/m, z1.h, z3.h +; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z0.h +; VBITS_GE_256-NEXT: splice z5.h, p1, z5.h, z1.h +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: umulh_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %1 = zext <32 x i16> %op1 to <32 x i32> + %2 = zext <32 x i16> %op2 to <32 x i32> + %mul = mul <32 x i32> %1, %2 + %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %res = trunc <32 x i32> %shr to <32 x i16> + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: umulh_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <2 x i32> %op1 to <2 x i64> + %2 = zext <2 x i32> %op2 to <2 x i64> + %mul = mul <2 x i64> %1, %2 + %shr = lshr <2 x i64> %mul, <i64 32, i64 32> + %res = trunc <2 x i64> %shr to <2 x i32> + ret <2 x i32> %res +} + +define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: umulh_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <4 x i32> %op1 to <4 x i64> + %2 = zext <4 x i32> %op2 to <4 x i64> + %mul = mul <4 x i64> %1, %2 + %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32> + %res = trunc <4 x i64> %shr to <4 x i32> + ret <4 x i32> %res +} + +define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: umulh_v8i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl2 +; VBITS_GE_128-NEXT: movprfx z4, z1 +; VBITS_GE_128-NEXT: umulh z4.s, p0/m, z4.s, z3.s +; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 +; VBITS_GE_128-NEXT: ext z3.b, z3.b, z3.b, #8 +; VBITS_GE_128-NEXT: umulh z1.s, p0/m, z1.s, z3.s +; VBITS_GE_128-NEXT: movprfx z3, z0 +; VBITS_GE_128-NEXT: umulh z3.s, p0/m, z3.s, z2.s +; VBITS_GE_128-NEXT: ext z2.b, z2.b, z2.b, #8 +; VBITS_GE_128-NEXT: ext z0.b, z0.b, z0.b, #8 +; VBITS_GE_128-NEXT: umulh z0.s, p0/m, z0.s, z2.s +; VBITS_GE_128-NEXT: splice z4.s, p0, z4.s, z1.s +; VBITS_GE_128-NEXT: splice z3.s, p0, z3.s, z0.s +; VBITS_GE_128-NEXT: stp q4, q3, [x0] +; 
VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: umulh_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: umulh_v8i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %insert = insertelement <8 x i64> undef, i64 32, i64 0 + %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer + %1 = zext <8 x i32> %op1 to <8 x i64> + %2 = zext <8 x i32> %op2 to <8 x i64> + %mul = mul <8 x i64> %1, %2 + %shr = lshr <8 x i64> %mul, %splat + %res = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: umulh_v16i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: mov x9, #12 +; VBITS_GE_128-NEXT: mov x10, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl2 +; VBITS_GE_128-NEXT: movprfx z18, z0 +; VBITS_GE_128-NEXT: umulh z18.s, p0/m, z18.s, z4.s +; VBITS_GE_128-NEXT: ext z0.b, z0.b, z0.b, #8 +; VBITS_GE_128-NEXT: mov z16.d, z2.d +; VBITS_GE_128-NEXT: ext z4.b, z4.b, z4.b, #8 +; VBITS_GE_128-NEXT: mov z17.d, z3.d +; VBITS_GE_128-NEXT: umulh z0.s, p0/m, z0.s, z4.s +; VBITS_GE_128-NEXT: movprfx z4, z1 +; VBITS_GE_128-NEXT: umulh z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: ext z5.b, z5.b, z5.b, #8 +; VBITS_GE_128-NEXT: ext z1.b, z1.b, z1.b, #8 +; VBITS_GE_128-NEXT: umulh z2.s, p0/m, z2.s, z6.s +; VBITS_GE_128-NEXT: umulh z3.s, p0/m, z3.s, z7.s +; VBITS_GE_128-NEXT: ext z6.b, z6.b, z6.b, #8 +; VBITS_GE_128-NEXT: umulh z1.s, p0/m, z1.s, z5.s +; VBITS_GE_128-NEXT: ext z16.b, z16.b, z16.b, #8 +; VBITS_GE_128-NEXT: ext z17.b, z17.b, z17.b, #8 +; VBITS_GE_128-NEXT: ext z7.b, z7.b, z7.b, #8 +; VBITS_GE_128-NEXT: movprfx z5, z16 +; VBITS_GE_128-NEXT: umulh z5.s, p0/m, z5.s, z6.s +; VBITS_GE_128-NEXT: movprfx z6, z17 +; VBITS_GE_128-NEXT: umulh z6.s, p0/m, z6.s, z7.s +; VBITS_GE_128-NEXT: splice z18.s, p0, z18.s, z0.s +; VBITS_GE_128-NEXT: splice z4.s, p0, z4.s, z1.s +; VBITS_GE_128-NEXT: splice z3.s, p0, z3.s, z6.s +; VBITS_GE_128-NEXT: splice z2.s, p0, z2.s, z5.s +; VBITS_GE_128-NEXT: stp q18, q4, [x0, #32] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: umulh_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, 
x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: movprfx z4, z0 +; VBITS_GE_256-NEXT: umulh z4.s, p1/m, z4.s, z2.s +; VBITS_GE_256-NEXT: movprfx z5, z1 +; VBITS_GE_256-NEXT: umulh z5.s, p1/m, z5.s, z3.s +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: umulh z0.s, p1/m, z0.s, z2.s +; VBITS_GE_256-NEXT: umulh z1.s, p1/m, z1.s, z3.s +; VBITS_GE_256-NEXT: splice z4.s, p1, z4.s, z0.s +; VBITS_GE_256-NEXT: splice z5.s, p1, z5.s, z1.s +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: umulh_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %1 = zext <16 x i32> %op1 to <16 x i64> + %2 = zext <16 x i32> %op2 to <16 x i64> + %mul = mul <16 x i64> %1, %2 + %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> + %res = trunc <16 x i64> %shr to <16 x i32> + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: umulh_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <1 x i64> %op1 to <1 x i128> + %2 = zext <1 x i64> %op2 to <1 x i128> + %mul = mul <1 x i128> %1, %2 + %shr = lshr <1 x i128> %mul, <i128 64> + %res = trunc <1 x i128> %shr to <1 x i64> + ret <1 x i64> %res +} + +define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: umulh_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <2 x i64> %op1 to <2 x i128> + %2 = zext <2 x i64> %op2 to <2 x i128> + %mul = mul <2 x i128> %1, %2 + %shr = lshr <2 x i128> %mul, <i128 64, i128 64> + %res = trunc <2 x i128> %shr to <2 x i64> + ret <2 x i64> %res +} + +define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: umulh_v4i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #2 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl1 +; VBITS_GE_128-NEXT: fmov x8, d0 +; VBITS_GE_128-NEXT: mov z4.d, z0.d[1] +; VBITS_GE_128-NEXT: fmov x10, d2 +; VBITS_GE_128-NEXT: mov z0.d, z1.d[1] +; VBITS_GE_128-NEXT: fmov x9, d1 +; VBITS_GE_128-NEXT: mov z1.d, z2.d[1] +; VBITS_GE_128-NEXT: mov z2.d, z3.d[1] +; VBITS_GE_128-NEXT: fmov x11, d3 +; VBITS_GE_128-NEXT: fmov x12, d0 +; VBITS_GE_128-NEXT: fmov x13, d2 +; VBITS_GE_128-NEXT: fmov x14, d4 +; VBITS_GE_128-NEXT: umulh x8, x8, x10 +; VBITS_GE_128-NEXT: fmov x10, d1 +; VBITS_GE_128-NEXT: umulh x9, x9, x11 +; 
VBITS_GE_128-NEXT: umulh x12, x12, x13 +; VBITS_GE_128-NEXT: umulh x10, x14, x10 +; VBITS_GE_128-NEXT: fmov d2, x8 +; VBITS_GE_128-NEXT: fmov d0, x9 +; VBITS_GE_128-NEXT: fmov d1, x12 +; VBITS_GE_128-NEXT: fmov d3, x10 +; VBITS_GE_128-NEXT: splice z0.d, p0, z0.d, z1.d +; VBITS_GE_128-NEXT: splice z2.d, p0, z2.d, z3.d +; VBITS_GE_128-NEXT: stp q0, q2, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: umulh_v4i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: umulh_v4i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl4 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %1 = zext <4 x i64> %op1 to <4 x i128> + %2 = zext <4 x i64> %op2 to <4 x i128> + %mul = mul <4 x i128> %1, %2 + %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64> + %res = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @umulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; VBITS_GE_128-LABEL: umulh_v8i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #2 +; VBITS_GE_128-NEXT: mov x9, #4 +; VBITS_GE_128-NEXT: mov x10, #6 +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_128-NEXT: mov z7.d, z0.d[1] +; VBITS_GE_128-NEXT: fmov x8, d0 +; VBITS_GE_128-NEXT: fmov x9, d7 +; VBITS_GE_128-NEXT: mov z7.d, z2.d[1] +; VBITS_GE_128-NEXT: fmov x10, d2 +; VBITS_GE_128-NEXT: mov z2.d, z1.d[1] +; VBITS_GE_128-NEXT: fmov x13, d2 +; VBITS_GE_128-NEXT: mov z2.d, z4.d[1] +; VBITS_GE_128-NEXT: fmov x17, d2 +; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_128-NEXT: fmov x12, d1 +; VBITS_GE_128-NEXT: fmov x18, d4 +; VBITS_GE_128-NEXT: mov z1.d, z3.d[1] +; VBITS_GE_128-NEXT: fmov x16, d6 +; VBITS_GE_128-NEXT: umulh x13, x13, x17 +; VBITS_GE_128-NEXT: fmov x17, d5 +; VBITS_GE_128-NEXT: fmov x15, d1 +; VBITS_GE_128-NEXT: mov z1.d, z6.d[1] +; VBITS_GE_128-NEXT: mov z2.d, z5.d[1] +; VBITS_GE_128-NEXT: umulh x12, x12, x18 +; VBITS_GE_128-NEXT: fmov x18, d1 +; VBITS_GE_128-NEXT: mov z1.d, z0.d[1] +; VBITS_GE_128-NEXT: fmov x11, d7 +; VBITS_GE_128-NEXT: fmov x14, d3 +; VBITS_GE_128-NEXT: fmov x1, d2 +; VBITS_GE_128-NEXT: umulh x10, x10, x17 +; VBITS_GE_128-NEXT: fmov x17, d1 +; VBITS_GE_128-NEXT: umulh x8, x8, x16 +; VBITS_GE_128-NEXT: fmov x16, d0 +; VBITS_GE_128-NEXT: umulh x9, x9, x18 +; VBITS_GE_128-NEXT: umulh x11, x11, x1 +; VBITS_GE_128-NEXT: fmov d1, x12 +; VBITS_GE_128-NEXT: umulh x15, x15, x17 +; VBITS_GE_128-NEXT: fmov d2, x13 +; VBITS_GE_128-NEXT: umulh x14, x14, x16 +; VBITS_GE_128-NEXT: fmov d0, x8 +; VBITS_GE_128-NEXT: fmov d3, x10 +; VBITS_GE_128-NEXT: fmov d7, x9 +; VBITS_GE_128-NEXT: fmov d4, x11 +; VBITS_GE_128-NEXT: ptrue p0.d, vl1 +; VBITS_GE_128-NEXT: fmov d6, 
x15 +; VBITS_GE_128-NEXT: splice z1.d, p0, z1.d, z2.d +; VBITS_GE_128-NEXT: fmov d5, x14 +; VBITS_GE_128-NEXT: splice z0.d, p0, z0.d, z7.d +; VBITS_GE_128-NEXT: splice z3.d, p0, z3.d, z4.d +; VBITS_GE_128-NEXT: stp q1, q3, [x0, #32] +; VBITS_GE_128-NEXT: splice z5.d, p0, z5.d, z6.d +; VBITS_GE_128-NEXT: stp q5, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: umulh_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl1 +; VBITS_GE_256-NEXT: ptrue p2.d, vl2 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fmov x9, d0 +; VBITS_GE_256-NEXT: mov z4.d, z0.d[1] +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: fmov x10, d1 +; VBITS_GE_256-NEXT: mov z5.d, z1.d[1] +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: fmov x11, d4 +; VBITS_GE_256-NEXT: mov z4.d, z0.d[1] +; VBITS_GE_256-NEXT: fmov x12, d0 +; VBITS_GE_256-NEXT: mov z0.d, z1.d[1] +; VBITS_GE_256-NEXT: fmov x15, d0 +; VBITS_GE_256-NEXT: fmov x16, d2 +; VBITS_GE_256-NEXT: mov z0.d, z2.d[1] +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: fmov x17, d0 +; VBITS_GE_256-NEXT: mov z0.d, z2.d[1] +; VBITS_GE_256-NEXT: fmov x18, d2 +; VBITS_GE_256-NEXT: fmov x1, d0 +; VBITS_GE_256-NEXT: fmov x2, d3 +; VBITS_GE_256-NEXT: mov z0.d, z3.d[1] +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: fmov x13, d5 +; VBITS_GE_256-NEXT: mov z2.d, z3.d[1] +; VBITS_GE_256-NEXT: umulh x12, x12, x18 +; VBITS_GE_256-NEXT: fmov x18, d2 +; VBITS_GE_256-NEXT: fmov x14, d4 +; VBITS_GE_256-NEXT: umulh x11, x11, x17 +; VBITS_GE_256-NEXT: fmov x17, d0 +; VBITS_GE_256-NEXT: umulh x9, x9, x16 +; VBITS_GE_256-NEXT: fmov x16, d1 +; VBITS_GE_256-NEXT: umulh x15, x15, x18 +; VBITS_GE_256-NEXT: fmov x18, d3 +; VBITS_GE_256-NEXT: umulh x14, x14, x1 +; VBITS_GE_256-NEXT: fmov d2, x12 +; VBITS_GE_256-NEXT: umulh x13, x13, x17 +; VBITS_GE_256-NEXT: fmov d1, x11 +; VBITS_GE_256-NEXT: umulh x10, x10, x2 +; VBITS_GE_256-NEXT: fmov d0, x9 +; VBITS_GE_256-NEXT: umulh x16, x16, x18 +; VBITS_GE_256-NEXT: fmov d7, x15 +; VBITS_GE_256-NEXT: fmov d3, x14 +; VBITS_GE_256-NEXT: fmov d5, x13 +; VBITS_GE_256-NEXT: splice z0.d, p1, z0.d, z1.d +; VBITS_GE_256-NEXT: fmov d4, x10 +; VBITS_GE_256-NEXT: fmov d6, x16 +; VBITS_GE_256-NEXT: splice z2.d, p1, z2.d, z3.d +; VBITS_GE_256-NEXT: splice z0.d, p2, z0.d, z2.d +; VBITS_GE_256-NEXT: splice z4.d, p1, z4.d, z5.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: splice z6.d, p1, z6.d, z7.d +; VBITS_GE_256-NEXT: splice z4.d, p2, z4.d, z6.d +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: umulh_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %1 = zext <8 x i64> %op1 to <8 x i128> + %2 = zext <8 x i64> %op2 to <8 x i128> + %mul = mul <8 x i128> %1, %2 + %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64> + %res = trunc <8 x i128> %shr to <8 x i64> + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +attributes #0 = { 
"target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -0,0 +1,5833 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -aarch64-sve-vector-bits-min=128 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128 +; RUN: llc -aarch64-sve-vector-bits-min=256 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=512 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=2048 -force-sve-when-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 + +target triple = "aarch64-unknown-linux-gnu" + +; +; SREM +; + +define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; VBITS_GE_128-LABEL: srem_v8i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_128-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_128-NEXT: sunpklo z2.h, z1.b +; VBITS_GE_128-NEXT: sunpklo z3.h, z0.b +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: sunpkhi z4.s, z2.h +; VBITS_GE_128-NEXT: sunpkhi z5.s, z3.h +; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_128-NEXT: ptrue p0.b, vl8 +; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z4.h +; VBITS_GE_128-NEXT: fmov w8, s2 +; VBITS_GE_128-NEXT: mov z3.h, z2.h[7] +; VBITS_GE_128-NEXT: mov z5.h, z2.h[5] +; VBITS_GE_128-NEXT: fmov w9, s3 +; VBITS_GE_128-NEXT: mov z4.h, z2.h[6] +; VBITS_GE_128-NEXT: mov z6.h, z2.h[4] +; VBITS_GE_128-NEXT: strb w8, [sp, #8] +; VBITS_GE_128-NEXT: fmov w8, s5 +; VBITS_GE_128-NEXT: mov z16.h, z2.h[2] +; VBITS_GE_128-NEXT: fmov w10, s4 +; VBITS_GE_128-NEXT: strb w9, [sp, #15] +; VBITS_GE_128-NEXT: fmov w9, s6 +; VBITS_GE_128-NEXT: strb w8, [sp, #13] +; VBITS_GE_128-NEXT: fmov w8, s16 +; VBITS_GE_128-NEXT: mov z7.h, z2.h[3] +; VBITS_GE_128-NEXT: mov z2.h, z2.h[1] +; VBITS_GE_128-NEXT: strb w10, [sp, #14] +; VBITS_GE_128-NEXT: fmov w10, s7 +; VBITS_GE_128-NEXT: strb w9, [sp, #12] +; VBITS_GE_128-NEXT: fmov w9, s2 +; VBITS_GE_128-NEXT: strb w8, [sp, #10] +; VBITS_GE_128-NEXT: add x8, sp, #8 +; VBITS_GE_128-NEXT: strb w10, [sp, #11] +; VBITS_GE_128-NEXT: strb w9, [sp, #9] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x8] +; VBITS_GE_128-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_128-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: srem_v8i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: sub sp, sp, #16 +; VBITS_GE_256-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: sunpklo z2.h, z1.b +; VBITS_GE_256-NEXT: sunpklo z3.h, z0.b +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.b, vl8 +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; 
VBITS_GE_256-NEXT: fmov w8, s2 +; VBITS_GE_256-NEXT: mov z3.h, z2.h[7] +; VBITS_GE_256-NEXT: mov z5.h, z2.h[5] +; VBITS_GE_256-NEXT: fmov w9, s3 +; VBITS_GE_256-NEXT: mov z4.h, z2.h[6] +; VBITS_GE_256-NEXT: mov z6.h, z2.h[4] +; VBITS_GE_256-NEXT: strb w8, [sp, #8] +; VBITS_GE_256-NEXT: fmov w8, s5 +; VBITS_GE_256-NEXT: mov z16.h, z2.h[2] +; VBITS_GE_256-NEXT: fmov w10, s4 +; VBITS_GE_256-NEXT: strb w9, [sp, #15] +; VBITS_GE_256-NEXT: fmov w9, s6 +; VBITS_GE_256-NEXT: strb w8, [sp, #13] +; VBITS_GE_256-NEXT: fmov w8, s16 +; VBITS_GE_256-NEXT: mov z7.h, z2.h[3] +; VBITS_GE_256-NEXT: mov z2.h, z2.h[1] +; VBITS_GE_256-NEXT: strb w10, [sp, #14] +; VBITS_GE_256-NEXT: fmov w10, s7 +; VBITS_GE_256-NEXT: strb w9, [sp, #12] +; VBITS_GE_256-NEXT: fmov w9, s2 +; VBITS_GE_256-NEXT: strb w8, [sp, #10] +; VBITS_GE_256-NEXT: add x8, sp, #8 +; VBITS_GE_256-NEXT: strb w10, [sp, #11] +; VBITS_GE_256-NEXT: strb w9, [sp, #9] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x8] +; VBITS_GE_256-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_256-NEXT: add sp, sp, #16 +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: srem_v8i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: sub sp, sp, #16 +; VBITS_GE_512-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_512-NEXT: sunpklo z2.h, z1.b +; VBITS_GE_512-NEXT: sunpklo z3.h, z0.b +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_512-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_512-NEXT: ptrue p0.b, vl8 +; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_512-NEXT: fmov w8, s2 +; VBITS_GE_512-NEXT: mov z3.h, z2.h[7] +; VBITS_GE_512-NEXT: mov z5.h, z2.h[5] +; VBITS_GE_512-NEXT: fmov w9, s3 +; VBITS_GE_512-NEXT: mov z4.h, z2.h[6] +; VBITS_GE_512-NEXT: mov z6.h, z2.h[4] +; VBITS_GE_512-NEXT: strb w8, [sp, #8] +; VBITS_GE_512-NEXT: fmov w8, s5 +; VBITS_GE_512-NEXT: mov z16.h, z2.h[2] +; VBITS_GE_512-NEXT: fmov w10, s4 +; VBITS_GE_512-NEXT: strb w9, [sp, #15] +; VBITS_GE_512-NEXT: fmov w9, s6 +; VBITS_GE_512-NEXT: strb w8, [sp, #13] +; VBITS_GE_512-NEXT: fmov w8, s16 +; VBITS_GE_512-NEXT: mov z7.h, z2.h[3] +; VBITS_GE_512-NEXT: mov z2.h, z2.h[1] +; VBITS_GE_512-NEXT: strb w10, [sp, #14] +; VBITS_GE_512-NEXT: fmov w10, s7 +; VBITS_GE_512-NEXT: strb w9, [sp, #12] +; VBITS_GE_512-NEXT: fmov w9, s2 +; VBITS_GE_512-NEXT: strb w8, [sp, #10] +; VBITS_GE_512-NEXT: add x8, sp, #8 +; VBITS_GE_512-NEXT: strb w10, [sp, #11] +; VBITS_GE_512-NEXT: strb w9, [sp, #9] +; VBITS_GE_512-NEXT: ld1b { z2.b }, p0/z, [x8] +; VBITS_GE_512-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_512-NEXT: add sp, sp, #16 +; VBITS_GE_512-NEXT: ret + %res = srem <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; VBITS_GE_128-LABEL: srem_v16i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_128-NEXT: sunpkhi z2.h, z1.b +; VBITS_GE_128-NEXT: sunpkhi z3.h, z0.b +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: sunpkhi z5.s, z2.h +; VBITS_GE_128-NEXT: sunpkhi z6.s, z3.h +; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: sunpklo z4.h, z1.b +; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, 
z2.s, z3.s +; VBITS_GE_128-NEXT: sunpklo z3.h, z0.b +; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s +; VBITS_GE_128-NEXT: sunpkhi z6.s, z4.h +; VBITS_GE_128-NEXT: sunpkhi z7.s, z3.h +; VBITS_GE_128-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; VBITS_GE_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s +; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z5.h +; VBITS_GE_128-NEXT: uzp1 z3.h, z3.h, z6.h +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: uzp1 z2.b, z3.b, z2.b +; VBITS_GE_128-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: srem_v16i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: sunpklo z2.h, z1.b +; VBITS_GE_256-NEXT: sunpklo z3.h, z0.b +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: sunpkhi z4.s, z2.h +; VBITS_GE_256-NEXT: sunpkhi z5.s, z3.h +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: srem_v16i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_512-NEXT: sunpklo z2.h, z1.b +; VBITS_GE_512-NEXT: sunpklo z3.h, z0.b +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_512-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_512-NEXT: ptrue p0.b, vl16 +; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_512-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_512-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_512-NEXT: ret + %res = srem <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: srem_v32i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ptrue p1.s, vl4 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: sunpkhi z5.h, z0.b +; VBITS_GE_128-NEXT: sunpklo z7.h, z0.b +; VBITS_GE_128-NEXT: sunpkhi z4.h, z2.b +; VBITS_GE_128-NEXT: sunpklo z6.h, z2.b +; VBITS_GE_128-NEXT: sunpkhi z16.s, z4.h +; VBITS_GE_128-NEXT: sunpkhi z17.s, z5.h +; VBITS_GE_128-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_128-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_128-NEXT: sunpkhi z18.s, z6.h +; VBITS_GE_128-NEXT: sdivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_128-NEXT: sdivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_128-NEXT: sunpkhi z5.s, z7.h +; VBITS_GE_128-NEXT: sunpklo z6.s, z6.h +; VBITS_GE_128-NEXT: sunpklo z7.s, z7.h +; VBITS_GE_128-NEXT: uzp1 z4.h, z4.h, z16.h +; VBITS_GE_128-NEXT: sdivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_128-NEXT: sunpkhi z7.h, z3.b +; VBITS_GE_128-NEXT: sunpkhi z16.h, z1.b +; VBITS_GE_128-NEXT: sdiv z5.s, p1/m, z5.s, z18.s +; VBITS_GE_128-NEXT: 
sunpkhi z17.s, z7.h +; VBITS_GE_128-NEXT: sunpkhi z18.s, z16.h +; VBITS_GE_128-NEXT: sunpklo z7.s, z7.h +; VBITS_GE_128-NEXT: sunpklo z16.s, z16.h +; VBITS_GE_128-NEXT: sdivr z17.s, p1/m, z17.s, z18.s +; VBITS_GE_128-NEXT: sdivr z7.s, p1/m, z7.s, z16.s +; VBITS_GE_128-NEXT: sunpklo z16.h, z3.b +; VBITS_GE_128-NEXT: sunpklo z18.h, z1.b +; VBITS_GE_128-NEXT: sunpkhi z19.s, z16.h +; VBITS_GE_128-NEXT: sunpkhi z20.s, z18.h +; VBITS_GE_128-NEXT: sunpklo z16.s, z16.h +; VBITS_GE_128-NEXT: sunpklo z18.s, z18.h +; VBITS_GE_128-NEXT: sdivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_128-NEXT: sdivr z16.s, p1/m, z16.s, z18.s +; VBITS_GE_128-NEXT: uzp1 z7.h, z7.h, z17.h +; VBITS_GE_128-NEXT: uzp1 z16.h, z16.h, z19.h +; VBITS_GE_128-NEXT: uzp1 z5.h, z6.h, z5.h +; VBITS_GE_128-NEXT: uzp1 z6.b, z16.b, z7.b +; VBITS_GE_128-NEXT: uzp1 z4.b, z5.b, z4.b +; VBITS_GE_128-NEXT: mls z1.b, p0/m, z6.b, z3.b +; VBITS_GE_128-NEXT: mls z0.b, p0/m, z4.b, z2.b +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: srem_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: sunpkhi z2.h, z1.b +; VBITS_GE_256-NEXT: sunpkhi z3.h, z0.b +; VBITS_GE_256-NEXT: sunpklo z4.h, z1.b +; VBITS_GE_256-NEXT: sunpklo z5.h, z0.b +; VBITS_GE_256-NEXT: sunpkhi z6.s, z2.h +; VBITS_GE_256-NEXT: sunpkhi z7.s, z3.h +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: sdivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_256-NEXT: sunpkhi z7.s, z4.h +; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s +; VBITS_GE_256-NEXT: sunpkhi z3.s, z5.h +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: sdiv z3.s, p1/m, z3.s, z7.s +; VBITS_GE_256-NEXT: sdivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z6.h +; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z2.b, z3.b, z2.b +; VBITS_GE_256-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = srem <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @srem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: srem_v64i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #32 +; VBITS_GE_128-NEXT: mov w9, #48 +; VBITS_GE_128-NEXT: mov w10, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ptrue p1.s, vl4 +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: sunpkhi z16.h, z0.b +; VBITS_GE_128-NEXT: sunpkhi z7.h, z4.b +; VBITS_GE_128-NEXT: sunpkhi z18.s, z16.h +; VBITS_GE_128-NEXT: sunpkhi z17.s, z7.h +; VBITS_GE_128-NEXT: sunpklo z7.s, z7.h +; VBITS_GE_128-NEXT: sunpklo z16.s, z16.h +; VBITS_GE_128-NEXT: sdivr z17.s, p1/m, z17.s, z18.s +; VBITS_GE_128-NEXT: sdivr z7.s, p1/m, z7.s, z16.s +; VBITS_GE_128-NEXT: sunpklo z16.h, z4.b +; VBITS_GE_128-NEXT: sunpklo z18.h, z0.b +; VBITS_GE_128-NEXT: sunpkhi z19.s, z16.h +; VBITS_GE_128-NEXT: sunpkhi z20.s, z18.h +; 
VBITS_GE_128-NEXT: sunpklo z16.s, z16.h +; VBITS_GE_128-NEXT: sunpklo z18.s, z18.h +; VBITS_GE_128-NEXT: sdivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_128-NEXT: sdivr z16.s, p1/m, z16.s, z18.s +; VBITS_GE_128-NEXT: uzp1 z7.h, z7.h, z17.h +; VBITS_GE_128-NEXT: sunpkhi z17.h, z6.b +; VBITS_GE_128-NEXT: sunpkhi z18.h, z1.b +; VBITS_GE_128-NEXT: uzp1 z16.h, z16.h, z19.h +; VBITS_GE_128-NEXT: sunpkhi z19.s, z17.h +; VBITS_GE_128-NEXT: sunpkhi z20.s, z18.h +; VBITS_GE_128-NEXT: sunpklo z17.s, z17.h +; VBITS_GE_128-NEXT: sunpklo z18.s, z18.h +; VBITS_GE_128-NEXT: sdivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_128-NEXT: sdivr z17.s, p1/m, z17.s, z18.s +; VBITS_GE_128-NEXT: uzp1 z7.b, z16.b, z7.b +; VBITS_GE_128-NEXT: uzp1 z16.h, z17.h, z19.h +; VBITS_GE_128-NEXT: sunpklo z17.h, z6.b +; VBITS_GE_128-NEXT: sunpklo z18.h, z1.b +; VBITS_GE_128-NEXT: sunpkhi z19.s, z17.h +; VBITS_GE_128-NEXT: sunpkhi z20.s, z18.h +; VBITS_GE_128-NEXT: sunpklo z17.s, z17.h +; VBITS_GE_128-NEXT: sunpklo z18.s, z18.h +; VBITS_GE_128-NEXT: sdivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_128-NEXT: sdivr z17.s, p1/m, z17.s, z18.s +; VBITS_GE_128-NEXT: sunpkhi z18.h, z5.b +; VBITS_GE_128-NEXT: sunpkhi z20.h, z2.b +; VBITS_GE_128-NEXT: sunpkhi z21.s, z18.h +; VBITS_GE_128-NEXT: sunpkhi z22.s, z20.h +; VBITS_GE_128-NEXT: uzp1 z17.h, z17.h, z19.h +; VBITS_GE_128-NEXT: movprfx z19, z22 +; VBITS_GE_128-NEXT: sdiv z19.s, p1/m, z19.s, z21.s +; VBITS_GE_128-NEXT: sunpklo z21.h, z5.b +; VBITS_GE_128-NEXT: sunpklo z22.h, z2.b +; VBITS_GE_128-NEXT: sunpklo z18.s, z18.h +; VBITS_GE_128-NEXT: sunpklo z20.s, z20.h +; VBITS_GE_128-NEXT: sunpkhi z23.s, z21.h +; VBITS_GE_128-NEXT: sunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: sunpklo z21.s, z21.h +; VBITS_GE_128-NEXT: sunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: sdivr z18.s, p1/m, z18.s, z20.s +; VBITS_GE_128-NEXT: movprfx z20, z24 +; VBITS_GE_128-NEXT: sdiv z20.s, p1/m, z20.s, z23.s +; VBITS_GE_128-NEXT: sdivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_128-NEXT: ld1b { z22.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: uzp1 z18.h, z18.h, z19.h +; VBITS_GE_128-NEXT: uzp1 z19.h, z21.h, z20.h +; VBITS_GE_128-NEXT: uzp1 z16.b, z17.b, z16.b +; VBITS_GE_128-NEXT: uzp1 z17.b, z19.b, z18.b +; VBITS_GE_128-NEXT: mls z1.b, p0/m, z16.b, z6.b +; VBITS_GE_128-NEXT: mls z2.b, p0/m, z17.b, z5.b +; VBITS_GE_128-NEXT: sunpkhi z6.h, z3.b +; VBITS_GE_128-NEXT: sunpkhi z5.h, z22.b +; VBITS_GE_128-NEXT: sunpkhi z17.s, z6.h +; VBITS_GE_128-NEXT: sunpkhi z16.s, z5.h +; VBITS_GE_128-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_128-NEXT: sunpklo z6.s, z6.h +; VBITS_GE_128-NEXT: sdivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_128-NEXT: sdivr z5.s, p1/m, z5.s, z6.s +; VBITS_GE_128-NEXT: sunpklo z6.h, z22.b +; VBITS_GE_128-NEXT: sunpklo z17.h, z3.b +; VBITS_GE_128-NEXT: sunpkhi z18.s, z6.h +; VBITS_GE_128-NEXT: sunpkhi z19.s, z17.h +; VBITS_GE_128-NEXT: sunpklo z6.s, z6.h +; VBITS_GE_128-NEXT: sunpklo z17.s, z17.h +; VBITS_GE_128-NEXT: sdivr z18.s, p1/m, z18.s, z19.s +; VBITS_GE_128-NEXT: sdivr z6.s, p1/m, z6.s, z17.s +; VBITS_GE_128-NEXT: uzp1 z5.h, z5.h, z16.h +; VBITS_GE_128-NEXT: uzp1 z6.h, z6.h, z18.h +; VBITS_GE_128-NEXT: mls z0.b, p0/m, z7.b, z4.b +; VBITS_GE_128-NEXT: uzp1 z5.b, z6.b, z5.b +; VBITS_GE_128-NEXT: stp q2, q1, [x0, #32] +; VBITS_GE_128-NEXT: mls z3.b, p0/m, z5.b, z22.b +; VBITS_GE_128-NEXT: stp q3, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: srem_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1b { 
z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: sunpkhi z5.h, z0.b +; VBITS_GE_256-NEXT: sunpklo z7.h, z0.b +; VBITS_GE_256-NEXT: sunpkhi z4.h, z2.b +; VBITS_GE_256-NEXT: sunpklo z6.h, z2.b +; VBITS_GE_256-NEXT: sunpkhi z16.s, z4.h +; VBITS_GE_256-NEXT: sunpkhi z17.s, z5.h +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: sunpkhi z18.s, z6.h +; VBITS_GE_256-NEXT: sdivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_256-NEXT: sunpkhi z5.s, z7.h +; VBITS_GE_256-NEXT: sunpklo z6.s, z6.h +; VBITS_GE_256-NEXT: sunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: sdiv z5.s, p1/m, z5.s, z18.s +; VBITS_GE_256-NEXT: sdivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_256-NEXT: sdivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_256-NEXT: uzp1 z5.h, z6.h, z5.h +; VBITS_GE_256-NEXT: sunpkhi z6.h, z3.b +; VBITS_GE_256-NEXT: sunpkhi z7.h, z1.b +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z16.h +; VBITS_GE_256-NEXT: sunpkhi z16.s, z6.h +; VBITS_GE_256-NEXT: sunpkhi z17.s, z7.h +; VBITS_GE_256-NEXT: sunpklo z6.s, z6.h +; VBITS_GE_256-NEXT: sunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: sdivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_256-NEXT: sdivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_256-NEXT: sunpklo z7.h, z3.b +; VBITS_GE_256-NEXT: sunpklo z17.h, z1.b +; VBITS_GE_256-NEXT: sunpkhi z18.s, z7.h +; VBITS_GE_256-NEXT: sunpkhi z19.s, z17.h +; VBITS_GE_256-NEXT: sunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: sunpklo z17.s, z17.h +; VBITS_GE_256-NEXT: sdivr z18.s, p1/m, z18.s, z19.s +; VBITS_GE_256-NEXT: sdivr z7.s, p1/m, z7.s, z17.s +; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z16.h +; VBITS_GE_256-NEXT: uzp1 z7.h, z7.h, z18.h +; VBITS_GE_256-NEXT: uzp1 z4.b, z5.b, z4.b +; VBITS_GE_256-NEXT: uzp1 z5.b, z7.b, z6.b +; VBITS_GE_256-NEXT: mls z0.b, p0/m, z4.b, z2.b +; VBITS_GE_256-NEXT: mls z1.b, p0/m, z5.b, z3.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = srem <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define void @srem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: srem_v128i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w11, #96 +; VBITS_GE_128-NEXT: mov w8, #112 +; VBITS_GE_128-NEXT: mov w9, #64 +; VBITS_GE_128-NEXT: mov w10, #80 +; VBITS_GE_128-NEXT: mov w12, #32 +; VBITS_GE_128-NEXT: mov w13, #48 +; VBITS_GE_128-NEXT: mov w14, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x11] +; VBITS_GE_128-NEXT: ld1b { z7.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0, x12] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x13] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x14] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z16.b }, p0/z, [x1, x14] +; VBITS_GE_128-NEXT: ptrue p1.s, vl4 +; VBITS_GE_128-NEXT: ld1b { z19.b }, p0/z, [x1, x13] +; VBITS_GE_128-NEXT: ld1b { z18.b }, p0/z, [x1, x11] +; VBITS_GE_128-NEXT: sunpkhi z21.h, z1.b +; VBITS_GE_128-NEXT: sunpkhi z17.h, z16.b +; VBITS_GE_128-NEXT: sunpkhi z22.s, z21.h +; VBITS_GE_128-NEXT: sunpkhi z20.s, z17.h +; VBITS_GE_128-NEXT: sunpklo z17.s, z17.h +; VBITS_GE_128-NEXT: sunpklo z21.s, z21.h +; VBITS_GE_128-NEXT: sdivr 
z20.s, p1/m, z20.s, z22.s +; VBITS_GE_128-NEXT: sdivr z17.s, p1/m, z17.s, z21.s +; VBITS_GE_128-NEXT: sunpklo z21.h, z16.b +; VBITS_GE_128-NEXT: sunpklo z22.h, z1.b +; VBITS_GE_128-NEXT: sunpkhi z23.s, z21.h +; VBITS_GE_128-NEXT: sunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: sunpklo z21.s, z21.h +; VBITS_GE_128-NEXT: sunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: sdivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_128-NEXT: sdivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_128-NEXT: uzp1 z17.h, z17.h, z20.h +; VBITS_GE_128-NEXT: uzp1 z20.h, z21.h, z23.h +; VBITS_GE_128-NEXT: ld1b { z21.b }, p0/z, [x1, x12] +; VBITS_GE_128-NEXT: uzp1 z17.b, z20.b, z17.b +; VBITS_GE_128-NEXT: sunpkhi z20.h, z19.b +; VBITS_GE_128-NEXT: sunpkhi z22.h, z2.b +; VBITS_GE_128-NEXT: sunpkhi z23.s, z20.h +; VBITS_GE_128-NEXT: sunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: sunpklo z20.s, z20.h +; VBITS_GE_128-NEXT: sunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: sdivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_128-NEXT: sdivr z20.s, p1/m, z20.s, z22.s +; VBITS_GE_128-NEXT: sunpklo z22.h, z19.b +; VBITS_GE_128-NEXT: sunpklo z24.h, z2.b +; VBITS_GE_128-NEXT: sunpkhi z25.s, z22.h +; VBITS_GE_128-NEXT: sunpkhi z26.s, z24.h +; VBITS_GE_128-NEXT: sunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: sunpklo z24.s, z24.h +; VBITS_GE_128-NEXT: sdivr z25.s, p1/m, z25.s, z26.s +; VBITS_GE_128-NEXT: sdivr z22.s, p1/m, z22.s, z24.s +; VBITS_GE_128-NEXT: uzp1 z20.h, z20.h, z23.h +; VBITS_GE_128-NEXT: sunpkhi z23.h, z21.b +; VBITS_GE_128-NEXT: sunpkhi z24.h, z3.b +; VBITS_GE_128-NEXT: uzp1 z22.h, z22.h, z25.h +; VBITS_GE_128-NEXT: sunpkhi z25.s, z23.h +; VBITS_GE_128-NEXT: sunpkhi z26.s, z24.h +; VBITS_GE_128-NEXT: sunpklo z23.s, z23.h +; VBITS_GE_128-NEXT: sunpklo z24.s, z24.h +; VBITS_GE_128-NEXT: sdivr z25.s, p1/m, z25.s, z26.s +; VBITS_GE_128-NEXT: sdivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_128-NEXT: sunpklo z24.h, z21.b +; VBITS_GE_128-NEXT: sunpklo z26.h, z3.b +; VBITS_GE_128-NEXT: sunpkhi z27.s, z24.h +; VBITS_GE_128-NEXT: sunpkhi z28.s, z26.h +; VBITS_GE_128-NEXT: sunpklo z24.s, z24.h +; VBITS_GE_128-NEXT: sunpklo z26.s, z26.h +; VBITS_GE_128-NEXT: sdivr z27.s, p1/m, z27.s, z28.s +; VBITS_GE_128-NEXT: sdivr z24.s, p1/m, z24.s, z26.s +; VBITS_GE_128-NEXT: ld1b { z26.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: uzp1 z23.h, z23.h, z25.h +; VBITS_GE_128-NEXT: uzp1 z24.h, z24.h, z27.h +; VBITS_GE_128-NEXT: uzp1 z20.b, z22.b, z20.b +; VBITS_GE_128-NEXT: ld1b { z28.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: uzp1 z22.b, z24.b, z23.b +; VBITS_GE_128-NEXT: mls z2.b, p0/m, z20.b, z19.b +; VBITS_GE_128-NEXT: sunpkhi z19.h, z26.b +; VBITS_GE_128-NEXT: sunpkhi z20.h, z4.b +; VBITS_GE_128-NEXT: mls z3.b, p0/m, z22.b, z21.b +; VBITS_GE_128-NEXT: sunpkhi z21.s, z19.h +; VBITS_GE_128-NEXT: sunpkhi z22.s, z20.h +; VBITS_GE_128-NEXT: sunpklo z19.s, z19.h +; VBITS_GE_128-NEXT: sunpklo z20.s, z20.h +; VBITS_GE_128-NEXT: sdivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_128-NEXT: sdivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_128-NEXT: sunpklo z20.h, z26.b +; VBITS_GE_128-NEXT: sunpklo z22.h, z4.b +; VBITS_GE_128-NEXT: sunpkhi z23.s, z20.h +; VBITS_GE_128-NEXT: sunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: sunpklo z20.s, z20.h +; VBITS_GE_128-NEXT: sunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: sdivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_128-NEXT: sdivr z20.s, p1/m, z20.s, z22.s +; VBITS_GE_128-NEXT: uzp1 z21.h, z19.h, z21.h +; VBITS_GE_128-NEXT: sunpkhi z19.h, z28.b +; VBITS_GE_128-NEXT: sunpkhi z22.h, z6.b +; VBITS_GE_128-NEXT: uzp1 z20.h, z20.h, z23.h +; VBITS_GE_128-NEXT: sunpkhi z23.s, z19.h +; 
VBITS_GE_128-NEXT: sunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: sunpklo z19.s, z19.h +; VBITS_GE_128-NEXT: sunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: sdivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_128-NEXT: sdiv z22.s, p1/m, z22.s, z19.s +; VBITS_GE_128-NEXT: sunpklo z19.h, z28.b +; VBITS_GE_128-NEXT: sunpklo z24.h, z6.b +; VBITS_GE_128-NEXT: sunpkhi z25.s, z19.h +; VBITS_GE_128-NEXT: sunpkhi z27.s, z24.h +; VBITS_GE_128-NEXT: sunpklo z19.s, z19.h +; VBITS_GE_128-NEXT: sdivr z25.s, p1/m, z25.s, z27.s +; VBITS_GE_128-NEXT: ld1b { z27.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: sunpklo z24.s, z24.h +; VBITS_GE_128-NEXT: uzp1 z22.h, z22.h, z23.h +; VBITS_GE_128-NEXT: sdiv z24.s, p1/m, z24.s, z19.s +; VBITS_GE_128-NEXT: uzp1 z20.b, z20.b, z21.b +; VBITS_GE_128-NEXT: uzp1 z23.h, z24.h, z25.h +; VBITS_GE_128-NEXT: mls z4.b, p0/m, z20.b, z26.b +; VBITS_GE_128-NEXT: uzp1 z21.b, z23.b, z22.b +; VBITS_GE_128-NEXT: sunpkhi z20.h, z27.b +; VBITS_GE_128-NEXT: mls z6.b, p0/m, z21.b, z28.b +; VBITS_GE_128-NEXT: sunpkhi z21.h, z7.b +; VBITS_GE_128-NEXT: sunpkhi z22.s, z20.h +; VBITS_GE_128-NEXT: sunpkhi z23.s, z21.h +; VBITS_GE_128-NEXT: sunpklo z20.s, z20.h +; VBITS_GE_128-NEXT: sunpklo z21.s, z21.h +; VBITS_GE_128-NEXT: sdivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_128-NEXT: sdivr z20.s, p1/m, z20.s, z21.s +; VBITS_GE_128-NEXT: sunpklo z21.h, z27.b +; VBITS_GE_128-NEXT: sunpklo z23.h, z7.b +; VBITS_GE_128-NEXT: sunpkhi z24.s, z21.h +; VBITS_GE_128-NEXT: sunpkhi z25.s, z23.h +; VBITS_GE_128-NEXT: sunpklo z21.s, z21.h +; VBITS_GE_128-NEXT: sunpklo z23.s, z23.h +; VBITS_GE_128-NEXT: sdivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_128-NEXT: sdivr z21.s, p1/m, z21.s, z23.s +; VBITS_GE_128-NEXT: uzp1 z20.h, z20.h, z22.h +; VBITS_GE_128-NEXT: sunpkhi z22.h, z18.b +; VBITS_GE_128-NEXT: sunpkhi z23.h, z0.b +; VBITS_GE_128-NEXT: uzp1 z21.h, z21.h, z24.h +; VBITS_GE_128-NEXT: sunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: sunpkhi z25.s, z23.h +; VBITS_GE_128-NEXT: sunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: sunpklo z23.s, z23.h +; VBITS_GE_128-NEXT: sdivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_128-NEXT: sdivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_128-NEXT: sunpklo z23.h, z18.b +; VBITS_GE_128-NEXT: sunpklo z25.h, z0.b +; VBITS_GE_128-NEXT: ld1b { z19.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: sunpkhi z26.s, z23.h +; VBITS_GE_128-NEXT: sunpkhi z28.s, z25.h +; VBITS_GE_128-NEXT: sunpklo z23.s, z23.h +; VBITS_GE_128-NEXT: sunpklo z25.s, z25.h +; VBITS_GE_128-NEXT: sdivr z26.s, p1/m, z26.s, z28.s +; VBITS_GE_128-NEXT: sdivr z23.s, p1/m, z23.s, z25.s +; VBITS_GE_128-NEXT: uzp1 z22.h, z22.h, z24.h +; VBITS_GE_128-NEXT: uzp1 z23.h, z23.h, z26.h +; VBITS_GE_128-NEXT: uzp1 z20.b, z21.b, z20.b +; VBITS_GE_128-NEXT: uzp1 z21.b, z23.b, z22.b +; VBITS_GE_128-NEXT: mls z7.b, p0/m, z20.b, z27.b +; VBITS_GE_128-NEXT: mls z0.b, p0/m, z21.b, z18.b +; VBITS_GE_128-NEXT: sunpkhi z18.h, z19.b +; VBITS_GE_128-NEXT: sunpkhi z20.h, z5.b +; VBITS_GE_128-NEXT: sunpkhi z21.s, z18.h +; VBITS_GE_128-NEXT: sunpkhi z22.s, z20.h +; VBITS_GE_128-NEXT: sunpklo z18.s, z18.h +; VBITS_GE_128-NEXT: sunpklo z20.s, z20.h +; VBITS_GE_128-NEXT: sdivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_128-NEXT: sdivr z18.s, p1/m, z18.s, z20.s +; VBITS_GE_128-NEXT: sunpklo z20.h, z19.b +; VBITS_GE_128-NEXT: sunpklo z22.h, z5.b +; VBITS_GE_128-NEXT: sunpkhi z23.s, z20.h +; VBITS_GE_128-NEXT: sunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: uzp1 z18.h, z18.h, z21.h +; VBITS_GE_128-NEXT: movprfx z21, z24 +; VBITS_GE_128-NEXT: sdiv z21.s, p1/m, z21.s, z23.s +; VBITS_GE_128-NEXT: sunpklo z20.s, z20.h +; 
VBITS_GE_128-NEXT: stp q3, q2, [x0, #32] +; VBITS_GE_128-NEXT: mls z1.b, p0/m, z17.b, z16.b +; VBITS_GE_128-NEXT: stp q6, q4, [x0, #64] +; VBITS_GE_128-NEXT: stp q0, q7, [x0, #96] +; VBITS_GE_128-NEXT: sunpklo z0.s, z22.h +; VBITS_GE_128-NEXT: sdiv z0.s, p1/m, z0.s, z20.s +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z21.h +; VBITS_GE_128-NEXT: uzp1 z0.b, z0.b, z18.b +; VBITS_GE_128-NEXT: mls z5.b, p0/m, z0.b, z19.b +; VBITS_GE_128-NEXT: stp q5, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: srem_v128i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #64 +; VBITS_GE_256-NEXT: mov w9, #96 +; VBITS_GE_256-NEXT: mov w10, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: sunpkhi z7.h, z1.b +; VBITS_GE_256-NEXT: sunpkhi z6.h, z4.b +; VBITS_GE_256-NEXT: sunpkhi z17.s, z7.h +; VBITS_GE_256-NEXT: sunpkhi z16.s, z6.h +; VBITS_GE_256-NEXT: sunpklo z6.s, z6.h +; VBITS_GE_256-NEXT: sunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: sdivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_256-NEXT: sdivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_256-NEXT: sunpklo z7.h, z4.b +; VBITS_GE_256-NEXT: sunpklo z17.h, z1.b +; VBITS_GE_256-NEXT: sunpkhi z18.s, z7.h +; VBITS_GE_256-NEXT: sunpkhi z19.s, z17.h +; VBITS_GE_256-NEXT: sunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: sunpklo z17.s, z17.h +; VBITS_GE_256-NEXT: sdivr z18.s, p1/m, z18.s, z19.s +; VBITS_GE_256-NEXT: sdivr z7.s, p1/m, z7.s, z17.s +; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z16.h +; VBITS_GE_256-NEXT: sunpkhi z16.h, z5.b +; VBITS_GE_256-NEXT: sunpkhi z17.h, z2.b +; VBITS_GE_256-NEXT: uzp1 z7.h, z7.h, z18.h +; VBITS_GE_256-NEXT: sunpkhi z18.s, z16.h +; VBITS_GE_256-NEXT: sunpkhi z19.s, z17.h +; VBITS_GE_256-NEXT: sunpklo z16.s, z16.h +; VBITS_GE_256-NEXT: sunpklo z17.s, z17.h +; VBITS_GE_256-NEXT: sdivr z18.s, p1/m, z18.s, z19.s +; VBITS_GE_256-NEXT: sdivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_256-NEXT: sunpklo z17.h, z5.b +; VBITS_GE_256-NEXT: sunpklo z19.h, z2.b +; VBITS_GE_256-NEXT: sunpkhi z20.s, z17.h +; VBITS_GE_256-NEXT: sunpkhi z21.s, z19.h +; VBITS_GE_256-NEXT: sunpklo z17.s, z17.h +; VBITS_GE_256-NEXT: sdivr z20.s, p1/m, z20.s, z21.s +; VBITS_GE_256-NEXT: ld1b { z21.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: sunpklo z19.s, z19.h +; VBITS_GE_256-NEXT: uzp1 z16.h, z16.h, z18.h +; VBITS_GE_256-NEXT: sdivr z17.s, p1/m, z17.s, z19.s +; VBITS_GE_256-NEXT: uzp1 z6.b, z7.b, z6.b +; VBITS_GE_256-NEXT: uzp1 z17.h, z17.h, z20.h +; VBITS_GE_256-NEXT: ld1b { z19.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: uzp1 z7.b, z17.b, z16.b +; VBITS_GE_256-NEXT: mls z1.b, p0/m, z6.b, z4.b +; VBITS_GE_256-NEXT: mls z2.b, p0/m, z7.b, z5.b +; VBITS_GE_256-NEXT: sunpkhi z4.h, z21.b +; VBITS_GE_256-NEXT: sunpkhi z5.h, z3.b +; VBITS_GE_256-NEXT: sunpkhi z6.s, z4.h +; VBITS_GE_256-NEXT: sunpkhi z7.s, z5.h +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: sdivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_256-NEXT: sdivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_256-NEXT: sunpklo z5.h, z21.b +; VBITS_GE_256-NEXT: sunpklo z7.h, z3.b +; VBITS_GE_256-NEXT: sunpkhi z16.s, z5.h +; VBITS_GE_256-NEXT: sunpkhi z17.s, z7.h +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: sunpklo z7.s, z7.h +; 
VBITS_GE_256-NEXT: sdivr z16.s, p1/m, z16.s, z17.s
+; VBITS_GE_256-NEXT: sdivr z5.s, p1/m, z5.s, z7.s
+; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z6.h
+; VBITS_GE_256-NEXT: sunpkhi z6.h, z19.b
+; VBITS_GE_256-NEXT: sunpkhi z7.h, z0.b
+; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z16.h
+; VBITS_GE_256-NEXT: sunpkhi z16.s, z6.h
+; VBITS_GE_256-NEXT: sunpkhi z17.s, z7.h
+; VBITS_GE_256-NEXT: sunpklo z6.s, z6.h
+; VBITS_GE_256-NEXT: sunpklo z7.s, z7.h
+; VBITS_GE_256-NEXT: sdivr z16.s, p1/m, z16.s, z17.s
+; VBITS_GE_256-NEXT: sdivr z6.s, p1/m, z6.s, z7.s
+; VBITS_GE_256-NEXT: sunpklo z7.h, z19.b
+; VBITS_GE_256-NEXT: sunpklo z17.h, z0.b
+; VBITS_GE_256-NEXT: sunpkhi z18.s, z7.h
+; VBITS_GE_256-NEXT: sunpkhi z20.s, z17.h
+; VBITS_GE_256-NEXT: sunpklo z7.s, z7.h
+; VBITS_GE_256-NEXT: sunpklo z17.s, z17.h
+; VBITS_GE_256-NEXT: sdivr z18.s, p1/m, z18.s, z20.s
+; VBITS_GE_256-NEXT: sdivr z7.s, p1/m, z7.s, z17.s
+; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z16.h
+; VBITS_GE_256-NEXT: uzp1 z7.h, z7.h, z18.h
+; VBITS_GE_256-NEXT: uzp1 z4.b, z5.b, z4.b
+; VBITS_GE_256-NEXT: uzp1 z5.b, z7.b, z6.b
+; VBITS_GE_256-NEXT: mls z3.b, p0/m, z4.b, z21.b
+; VBITS_GE_256-NEXT: mls z0.b, p0/m, z5.b, z19.b
+; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x9]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x10]
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+ %op1 = load <128 x i8>, <128 x i8>* %a
+ %op2 = load <128 x i8>, <128 x i8>* %b
+ %res = srem <128 x i8> %op1, %op2
+ store <128 x i8> %res, <128 x i8>* %a
+ ret void
+}
+
+define void @srem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
+; VBITS_GE_128-LABEL: srem_v256i8:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
+; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 32
+; VBITS_GE_128-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
+; VBITS_GE_128-NEXT: .cfi_offset b8, -8
+; VBITS_GE_128-NEXT: .cfi_offset b9, -16
+; VBITS_GE_128-NEXT: .cfi_offset b10, -32
+; VBITS_GE_128-NEXT: mov w3, #240
+; VBITS_GE_128-NEXT: mov w9, #224
+; VBITS_GE_128-NEXT: mov w10, #208
+; VBITS_GE_128-NEXT: mov w11, #192
+; VBITS_GE_128-NEXT: mov w12, #176
+; VBITS_GE_128-NEXT: mov w13, #160
+; VBITS_GE_128-NEXT: mov w14, #144
+; VBITS_GE_128-NEXT: mov w15, #128
+; VBITS_GE_128-NEXT: mov w16, #112
+; VBITS_GE_128-NEXT: mov w17, #96
+; VBITS_GE_128-NEXT: mov w18, #80
+; VBITS_GE_128-NEXT: mov w2, #64
+; VBITS_GE_128-NEXT: mov w4, #48
+; VBITS_GE_128-NEXT: mov w5, #32
+; VBITS_GE_128-NEXT: mov w8, #16
+; VBITS_GE_128-NEXT: ptrue p0.b, vl16
+; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x3]
+; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
+; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x10]
+; VBITS_GE_128-NEXT: ld1b { z16.b }, p0/z, [x0, x11]
+; VBITS_GE_128-NEXT: ld1b { z7.b }, p0/z, [x0, x12]
+; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x0, x13]
+; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x0, x14]
+; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x0, x15]
+; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0, x16]
+; VBITS_GE_128-NEXT: ld1b { z23.b }, p0/z, [x0, x17]
+; VBITS_GE_128-NEXT: ld1b { z22.b }, p0/z, [x0, x18]
+; VBITS_GE_128-NEXT: ld1b { z21.b }, p0/z, [x0, x2]
+; VBITS_GE_128-NEXT: ld1b { z19.b }, p0/z, [x0, x4]
+; VBITS_GE_128-NEXT: ld1b { z18.b }, p0/z, [x0, x5]
+; VBITS_GE_128-NEXT: ld1b { z17.b }, p0/z, [x0, x8]
+; VBITS_GE_128-NEXT: ld1b { z20.b }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1b { z26.b }, p0/z, [x1, x5]
+; VBITS_GE_128-NEXT: ptrue p1.s, vl4
+; VBITS_GE_128-NEXT: ld1b { z25.b },
p0/z, [x1, x4] +; VBITS_GE_128-NEXT: ld1b { z24.b }, p0/z, [x1, x3] +; VBITS_GE_128-NEXT: sunpkhi z28.h, z18.b +; VBITS_GE_128-NEXT: sunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sunpkhi z27.h, z26.b +; VBITS_GE_128-NEXT: sunpkhi z29.s, z27.h +; VBITS_GE_128-NEXT: sunpklo z27.s, z27.h +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: sdivr z27.s, p1/m, z27.s, z28.s +; VBITS_GE_128-NEXT: sunpklo z28.h, z26.b +; VBITS_GE_128-NEXT: sunpklo z30.h, z18.b +; VBITS_GE_128-NEXT: sunpkhi z31.s, z28.h +; VBITS_GE_128-NEXT: sunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: sdivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: sdivr z28.s, p1/m, z28.s, z30.s +; VBITS_GE_128-NEXT: uzp1 z29.h, z27.h, z29.h +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z31.h +; VBITS_GE_128-NEXT: uzp1 z28.b, z28.b, z29.b +; VBITS_GE_128-NEXT: sunpkhi z29.h, z25.b +; VBITS_GE_128-NEXT: sunpkhi z30.h, z19.b +; VBITS_GE_128-NEXT: sunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: sunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: ld1b { z27.b }, p0/z, [x1, x2] +; VBITS_GE_128-NEXT: sdivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: sunpklo z30.h, z25.b +; VBITS_GE_128-NEXT: sunpklo z8.h, z19.b +; VBITS_GE_128-NEXT: sunpkhi z9.s, z30.h +; VBITS_GE_128-NEXT: sunpkhi z10.s, z8.h +; VBITS_GE_128-NEXT: sunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: sunpklo z8.s, z8.h +; VBITS_GE_128-NEXT: sdivr z9.s, p1/m, z9.s, z10.s +; VBITS_GE_128-NEXT: sdivr z30.s, p1/m, z30.s, z8.s +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z31.h +; VBITS_GE_128-NEXT: uzp1 z30.h, z30.h, z9.h +; VBITS_GE_128-NEXT: mls z18.b, p0/m, z28.b, z26.b +; VBITS_GE_128-NEXT: uzp1 z28.b, z30.b, z29.b +; VBITS_GE_128-NEXT: sunpkhi z26.h, z27.b +; VBITS_GE_128-NEXT: sunpkhi z29.h, z21.b +; VBITS_GE_128-NEXT: sunpkhi z30.s, z26.h +; VBITS_GE_128-NEXT: sunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z26.s, z26.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: sdivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: sdiv z29.s, p1/m, z29.s, z26.s +; VBITS_GE_128-NEXT: sunpklo z26.h, z27.b +; VBITS_GE_128-NEXT: sunpklo z31.h, z21.b +; VBITS_GE_128-NEXT: sunpkhi z8.s, z26.h +; VBITS_GE_128-NEXT: sunpkhi z9.s, z31.h +; VBITS_GE_128-NEXT: ld1b { z10.b }, p0/z, [x1, x18] +; VBITS_GE_128-NEXT: sdivr z8.s, p1/m, z8.s, z9.s +; VBITS_GE_128-NEXT: sunpklo z9.s, z26.h +; VBITS_GE_128-NEXT: sunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: sdiv z31.s, p1/m, z31.s, z9.s +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z30.h, z31.h, z8.h +; VBITS_GE_128-NEXT: mls z19.b, p0/m, z28.b, z25.b +; VBITS_GE_128-NEXT: uzp1 z25.b, z30.b, z29.b +; VBITS_GE_128-NEXT: sunpkhi z28.h, z10.b +; VBITS_GE_128-NEXT: sunpkhi z29.h, z22.b +; VBITS_GE_128-NEXT: sunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: sunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: ld1b { z26.b }, p0/z, [x1, x17] +; VBITS_GE_128-NEXT: sdivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: sdivr z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: sunpklo z29.h, z10.b +; VBITS_GE_128-NEXT: sunpklo z31.h, z22.b +; VBITS_GE_128-NEXT: sunpkhi z8.s, z29.h +; VBITS_GE_128-NEXT: sunpkhi z9.s, z31.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z31.s, z31.h +; 
VBITS_GE_128-NEXT: sdivr z8.s, p1/m, z8.s, z9.s +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z8.h +; VBITS_GE_128-NEXT: mls z21.b, p0/m, z25.b, z27.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: sunpkhi z25.h, z26.b +; VBITS_GE_128-NEXT: sunpkhi z28.h, z23.b +; VBITS_GE_128-NEXT: sunpkhi z29.s, z25.h +; VBITS_GE_128-NEXT: sunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z25.s, z25.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: sdiv z28.s, p1/m, z28.s, z25.s +; VBITS_GE_128-NEXT: sunpklo z25.h, z26.b +; VBITS_GE_128-NEXT: sunpklo z30.h, z23.b +; VBITS_GE_128-NEXT: sunpkhi z31.s, z25.h +; VBITS_GE_128-NEXT: sunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: ld1b { z9.b }, p0/z, [x1, x16] +; VBITS_GE_128-NEXT: sdivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: sunpklo z8.s, z25.h +; VBITS_GE_128-NEXT: sunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: sdiv z30.s, p1/m, z30.s, z8.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z29.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z30.h, z31.h +; VBITS_GE_128-NEXT: mls z22.b, p0/m, z27.b, z10.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: sunpkhi z28.h, z9.b +; VBITS_GE_128-NEXT: sunpkhi z29.h, z3.b +; VBITS_GE_128-NEXT: sunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: sunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: ld1b { z25.b }, p0/z, [x1, x15] +; VBITS_GE_128-NEXT: sdivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: sdivr z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: sunpklo z29.h, z9.b +; VBITS_GE_128-NEXT: sunpklo z31.h, z3.b +; VBITS_GE_128-NEXT: sunpkhi z8.s, z29.h +; VBITS_GE_128-NEXT: sunpkhi z10.s, z31.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: sdivr z8.s, p1/m, z8.s, z10.s +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z8.h +; VBITS_GE_128-NEXT: mls z23.b, p0/m, z27.b, z26.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: sunpkhi z26.h, z25.b +; VBITS_GE_128-NEXT: sunpkhi z28.h, z4.b +; VBITS_GE_128-NEXT: sunpkhi z29.s, z26.h +; VBITS_GE_128-NEXT: sunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z26.s, z26.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: sdiv z28.s, p1/m, z28.s, z26.s +; VBITS_GE_128-NEXT: sunpklo z26.h, z25.b +; VBITS_GE_128-NEXT: sunpklo z30.h, z4.b +; VBITS_GE_128-NEXT: sunpkhi z31.s, z26.h +; VBITS_GE_128-NEXT: sunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: ld1b { z10.b }, p0/z, [x1, x14] +; VBITS_GE_128-NEXT: sdivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: sunpklo z8.s, z26.h +; VBITS_GE_128-NEXT: sunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: sdiv z30.s, p1/m, z30.s, z8.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z29.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z30.h, z31.h +; VBITS_GE_128-NEXT: mls z3.b, p0/m, z27.b, z9.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: sunpkhi z28.h, z10.b +; VBITS_GE_128-NEXT: sunpkhi z29.h, z5.b +; VBITS_GE_128-NEXT: sunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: sunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: ld1b { z26.b }, p0/z, [x1, x13] +; VBITS_GE_128-NEXT: sdivr z30.s, 
p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: sdivr z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: sunpklo z29.h, z10.b +; VBITS_GE_128-NEXT: sunpklo z31.h, z5.b +; VBITS_GE_128-NEXT: sunpkhi z8.s, z29.h +; VBITS_GE_128-NEXT: sunpkhi z9.s, z31.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: sdivr z8.s, p1/m, z8.s, z9.s +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z8.h +; VBITS_GE_128-NEXT: mls z4.b, p0/m, z27.b, z25.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: sunpkhi z25.h, z26.b +; VBITS_GE_128-NEXT: sunpkhi z28.h, z6.b +; VBITS_GE_128-NEXT: sunpkhi z29.s, z25.h +; VBITS_GE_128-NEXT: sunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z25.s, z25.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: sdiv z28.s, p1/m, z28.s, z25.s +; VBITS_GE_128-NEXT: sunpklo z25.h, z26.b +; VBITS_GE_128-NEXT: sunpklo z30.h, z6.b +; VBITS_GE_128-NEXT: sunpkhi z31.s, z25.h +; VBITS_GE_128-NEXT: sunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: ld1b { z9.b }, p0/z, [x1, x12] +; VBITS_GE_128-NEXT: sdivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: sunpklo z8.s, z25.h +; VBITS_GE_128-NEXT: sunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: sdiv z30.s, p1/m, z30.s, z8.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z29.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z30.h, z31.h +; VBITS_GE_128-NEXT: mls z5.b, p0/m, z27.b, z10.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: sunpkhi z28.h, z9.b +; VBITS_GE_128-NEXT: sunpkhi z29.h, z7.b +; VBITS_GE_128-NEXT: sunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: sunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: ld1b { z25.b }, p0/z, [x1, x11] +; VBITS_GE_128-NEXT: sdivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: sdivr z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: sunpklo z29.h, z9.b +; VBITS_GE_128-NEXT: sunpklo z31.h, z7.b +; VBITS_GE_128-NEXT: sunpkhi z8.s, z29.h +; VBITS_GE_128-NEXT: sunpkhi z10.s, z31.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: sdivr z8.s, p1/m, z8.s, z10.s +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z8.h +; VBITS_GE_128-NEXT: mls z6.b, p0/m, z27.b, z26.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: sunpkhi z26.h, z25.b +; VBITS_GE_128-NEXT: sunpkhi z28.h, z16.b +; VBITS_GE_128-NEXT: sunpkhi z29.s, z26.h +; VBITS_GE_128-NEXT: sunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z26.s, z26.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: sdiv z28.s, p1/m, z28.s, z26.s +; VBITS_GE_128-NEXT: sunpklo z26.h, z25.b +; VBITS_GE_128-NEXT: sunpklo z30.h, z16.b +; VBITS_GE_128-NEXT: sunpkhi z31.s, z26.h +; VBITS_GE_128-NEXT: sunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: ld1b { z10.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: sdivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: sunpklo z8.s, z26.h +; VBITS_GE_128-NEXT: sunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: sdiv z30.s, p1/m, z30.s, z8.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z29.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z30.h, z31.h +; VBITS_GE_128-NEXT: mls z7.b, p0/m, z27.b, z9.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; 
VBITS_GE_128-NEXT: sunpkhi z28.h, z10.b +; VBITS_GE_128-NEXT: sunpkhi z29.h, z0.b +; VBITS_GE_128-NEXT: sunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: sunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: ld1b { z26.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: sdivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: sdivr z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: sunpklo z29.h, z10.b +; VBITS_GE_128-NEXT: sunpklo z31.h, z0.b +; VBITS_GE_128-NEXT: sunpkhi z8.s, z29.h +; VBITS_GE_128-NEXT: sunpkhi z9.s, z31.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: sdivr z8.s, p1/m, z8.s, z9.s +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z8.h +; VBITS_GE_128-NEXT: mls z16.b, p0/m, z27.b, z25.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: sunpkhi z25.h, z26.b +; VBITS_GE_128-NEXT: sunpkhi z28.h, z1.b +; VBITS_GE_128-NEXT: sunpkhi z29.s, z25.h +; VBITS_GE_128-NEXT: sunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: sunpklo z30.h, z26.b +; VBITS_GE_128-NEXT: sunpklo z31.h, z1.b +; VBITS_GE_128-NEXT: sunpklo z25.s, z25.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: sunpkhi z9.s, z31.h +; VBITS_GE_128-NEXT: sunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: sunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: sdiv z28.s, p1/m, z28.s, z25.s +; VBITS_GE_128-NEXT: sdivr z8.s, p1/m, z8.s, z9.s +; VBITS_GE_128-NEXT: sdivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z29.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z30.h, z8.h +; VBITS_GE_128-NEXT: mls z0.b, p0/m, z27.b, z10.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: sunpkhi z28.h, z24.b +; VBITS_GE_128-NEXT: sunpkhi z29.h, z2.b +; VBITS_GE_128-NEXT: sunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: sunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: ld1b { z25.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z9.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: sdivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: sdivr z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: sunpklo z29.h, z24.b +; VBITS_GE_128-NEXT: sunpklo z31.h, z2.b +; VBITS_GE_128-NEXT: sunpkhi z8.s, z29.h +; VBITS_GE_128-NEXT: sunpkhi z10.s, z31.h +; VBITS_GE_128-NEXT: sunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: sunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: sdivr z8.s, p1/m, z8.s, z10.s +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z8.h +; VBITS_GE_128-NEXT: mls z1.b, p0/m, z27.b, z26.b +; VBITS_GE_128-NEXT: uzp1 z26.b, z29.b, z28.b +; VBITS_GE_128-NEXT: sunpkhi z27.h, z9.b +; VBITS_GE_128-NEXT: sunpkhi z28.h, z20.b +; VBITS_GE_128-NEXT: sunpkhi z29.s, z27.h +; VBITS_GE_128-NEXT: sunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z27.s, z27.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: sdivr z27.s, p1/m, z27.s, z28.s +; VBITS_GE_128-NEXT: sunpklo z28.h, z9.b +; VBITS_GE_128-NEXT: sunpklo z30.h, z20.b +; VBITS_GE_128-NEXT: sunpkhi z31.s, z28.h +; VBITS_GE_128-NEXT: sunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: sunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: sunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: sdivr 
z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: sdivr z28.s, p1/m, z28.s, z30.s +; VBITS_GE_128-NEXT: uzp1 z27.h, z27.h, z29.h +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z31.h +; VBITS_GE_128-NEXT: mls z2.b, p0/m, z26.b, z24.b +; VBITS_GE_128-NEXT: uzp1 z24.b, z28.b, z27.b +; VBITS_GE_128-NEXT: mls z20.b, p0/m, z24.b, z9.b +; VBITS_GE_128-NEXT: sunpkhi z24.h, z25.b +; VBITS_GE_128-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q18, q19, [x0, #32] +; VBITS_GE_128-NEXT: stp q21, q22, [x0, #64] +; VBITS_GE_128-NEXT: stp q23, q3, [x0, #96] +; VBITS_GE_128-NEXT: sunpklo z3.h, z17.b +; VBITS_GE_128-NEXT: stp q4, q5, [x0, #128] +; VBITS_GE_128-NEXT: sunpkhi z4.s, z3.h +; VBITS_GE_128-NEXT: stp q16, q0, [x0, #192] +; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: stp q6, q7, [x0, #160] +; VBITS_GE_128-NEXT: stp q1, q2, [x0, #224] +; VBITS_GE_128-NEXT: sunpkhi z1.h, z17.b +; VBITS_GE_128-NEXT: sunpkhi z2.s, z24.h +; VBITS_GE_128-NEXT: sunpkhi z0.s, z1.h +; VBITS_GE_128-NEXT: sdiv z0.s, p1/m, z0.s, z2.s +; VBITS_GE_128-NEXT: sunpklo z2.s, z24.h +; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: sdiv z1.s, p1/m, z1.s, z2.s +; VBITS_GE_128-NEXT: uzp1 z0.h, z1.h, z0.h +; VBITS_GE_128-NEXT: sunpklo z1.h, z25.b +; VBITS_GE_128-NEXT: sunpkhi z2.s, z1.h +; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: sdivr z2.s, p1/m, z2.s, z4.s +; VBITS_GE_128-NEXT: sdivr z1.s, p1/m, z1.s, z3.s +; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z2.h +; VBITS_GE_128-NEXT: uzp1 z0.b, z1.b, z0.b +; VBITS_GE_128-NEXT: mls z17.b, p0/m, z0.b, z25.b +; VBITS_GE_128-NEXT: stp q20, q17, [x0] +; VBITS_GE_128-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: srem_v256i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #192 +; VBITS_GE_256-NEXT: mov w9, #224 +; VBITS_GE_256-NEXT: mov w10, #128 +; VBITS_GE_256-NEXT: mov w11, #160 +; VBITS_GE_256-NEXT: mov w12, #64 +; VBITS_GE_256-NEXT: mov w13, #96 +; VBITS_GE_256-NEXT: mov w14, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x0, x11] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x0, x12] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x13] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x14] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z18.b }, p0/z, [x1, x14] +; VBITS_GE_256-NEXT: ld1b { z19.b }, p0/z, [x1, x13] +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1b { z28.b }, p0/z, [x1, x11] +; VBITS_GE_256-NEXT: ld1b { z17.b }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1b { z16.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: sunpkhi z22.h, z2.b +; VBITS_GE_256-NEXT: sunpkhi z20.h, z18.b +; VBITS_GE_256-NEXT: sunpkhi z23.s, z22.h +; VBITS_GE_256-NEXT: sunpkhi z21.s, z20.h +; VBITS_GE_256-NEXT: sunpklo z20.s, z20.h +; VBITS_GE_256-NEXT: sunpklo z22.s, z22.h +; VBITS_GE_256-NEXT: sdivr z21.s, p1/m, z21.s, z23.s +; VBITS_GE_256-NEXT: sdivr z20.s, p1/m, z20.s, z22.s +; VBITS_GE_256-NEXT: sunpklo z22.h, z18.b +; VBITS_GE_256-NEXT: sunpklo z23.h, z2.b +; VBITS_GE_256-NEXT: sunpkhi z24.s, z22.h +; VBITS_GE_256-NEXT: sunpkhi z25.s, z23.h +; VBITS_GE_256-NEXT: sunpklo z22.s, z22.h +; VBITS_GE_256-NEXT: sunpklo z23.s, z23.h +; VBITS_GE_256-NEXT: sdivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_256-NEXT: sdivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: 
uzp1 z20.h, z20.h, z21.h +; VBITS_GE_256-NEXT: uzp1 z21.h, z22.h, z24.h +; VBITS_GE_256-NEXT: sunpkhi z22.h, z19.b +; VBITS_GE_256-NEXT: sunpkhi z23.h, z3.b +; VBITS_GE_256-NEXT: sunpkhi z24.s, z22.h +; VBITS_GE_256-NEXT: sunpkhi z25.s, z23.h +; VBITS_GE_256-NEXT: sunpklo z22.s, z22.h +; VBITS_GE_256-NEXT: sunpklo z23.s, z23.h +; VBITS_GE_256-NEXT: sdivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_256-NEXT: sdivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: sunpklo z23.h, z19.b +; VBITS_GE_256-NEXT: sunpklo z25.h, z3.b +; VBITS_GE_256-NEXT: sunpkhi z26.s, z23.h +; VBITS_GE_256-NEXT: sunpkhi z27.s, z25.h +; VBITS_GE_256-NEXT: sunpklo z23.s, z23.h +; VBITS_GE_256-NEXT: sdivr z26.s, p1/m, z26.s, z27.s +; VBITS_GE_256-NEXT: sunpklo z25.s, z25.h +; VBITS_GE_256-NEXT: ld1b { z27.b }, p0/z, [x1, x12] +; VBITS_GE_256-NEXT: sdivr z23.s, p1/m, z23.s, z25.s +; VBITS_GE_256-NEXT: uzp1 z22.h, z22.h, z24.h +; VBITS_GE_256-NEXT: uzp1 z23.h, z23.h, z26.h +; VBITS_GE_256-NEXT: uzp1 z20.b, z21.b, z20.b +; VBITS_GE_256-NEXT: uzp1 z21.b, z23.b, z22.b +; VBITS_GE_256-NEXT: mls z2.b, p0/m, z20.b, z18.b +; VBITS_GE_256-NEXT: mls z3.b, p0/m, z21.b, z19.b +; VBITS_GE_256-NEXT: sunpkhi z18.h, z27.b +; VBITS_GE_256-NEXT: sunpkhi z19.h, z4.b +; VBITS_GE_256-NEXT: sunpkhi z20.s, z18.h +; VBITS_GE_256-NEXT: sunpkhi z21.s, z19.h +; VBITS_GE_256-NEXT: sunpklo z18.s, z18.h +; VBITS_GE_256-NEXT: sunpklo z19.s, z19.h +; VBITS_GE_256-NEXT: sdivr z20.s, p1/m, z20.s, z21.s +; VBITS_GE_256-NEXT: sdivr z18.s, p1/m, z18.s, z19.s +; VBITS_GE_256-NEXT: sunpklo z19.h, z27.b +; VBITS_GE_256-NEXT: sunpklo z21.h, z4.b +; VBITS_GE_256-NEXT: sunpkhi z22.s, z19.h +; VBITS_GE_256-NEXT: sunpkhi z23.s, z21.h +; VBITS_GE_256-NEXT: sunpklo z19.s, z19.h +; VBITS_GE_256-NEXT: sunpklo z21.s, z21.h +; VBITS_GE_256-NEXT: sdivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: sdivr z19.s, p1/m, z19.s, z21.s +; VBITS_GE_256-NEXT: uzp1 z20.h, z18.h, z20.h +; VBITS_GE_256-NEXT: sunpkhi z18.h, z28.b +; VBITS_GE_256-NEXT: sunpkhi z21.h, z5.b +; VBITS_GE_256-NEXT: uzp1 z19.h, z19.h, z22.h +; VBITS_GE_256-NEXT: sunpkhi z22.s, z18.h +; VBITS_GE_256-NEXT: sunpkhi z23.s, z21.h +; VBITS_GE_256-NEXT: sunpklo z18.s, z18.h +; VBITS_GE_256-NEXT: sunpklo z21.s, z21.h +; VBITS_GE_256-NEXT: sdivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: sdiv z21.s, p1/m, z21.s, z18.s +; VBITS_GE_256-NEXT: sunpklo z18.h, z28.b +; VBITS_GE_256-NEXT: sunpklo z23.h, z5.b +; VBITS_GE_256-NEXT: sunpkhi z24.s, z18.h +; VBITS_GE_256-NEXT: sunpkhi z25.s, z23.h +; VBITS_GE_256-NEXT: ld1b { z26.b }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: sdivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_256-NEXT: sunpklo z25.s, z18.h +; VBITS_GE_256-NEXT: sunpklo z23.s, z23.h +; VBITS_GE_256-NEXT: sdiv z23.s, p1/m, z23.s, z25.s +; VBITS_GE_256-NEXT: uzp1 z21.h, z21.h, z22.h +; VBITS_GE_256-NEXT: uzp1 z22.h, z23.h, z24.h +; VBITS_GE_256-NEXT: uzp1 z19.b, z19.b, z20.b +; VBITS_GE_256-NEXT: uzp1 z20.b, z22.b, z21.b +; VBITS_GE_256-NEXT: mls z4.b, p0/m, z19.b, z27.b +; VBITS_GE_256-NEXT: mls z5.b, p0/m, z20.b, z28.b +; VBITS_GE_256-NEXT: sunpkhi z19.h, z26.b +; VBITS_GE_256-NEXT: sunpkhi z20.h, z6.b +; VBITS_GE_256-NEXT: sunpkhi z21.s, z19.h +; VBITS_GE_256-NEXT: sunpkhi z22.s, z20.h +; VBITS_GE_256-NEXT: sunpklo z19.s, z19.h +; VBITS_GE_256-NEXT: sunpklo z20.s, z20.h +; VBITS_GE_256-NEXT: sdivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_256-NEXT: sdivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_256-NEXT: sunpklo z20.h, z26.b +; VBITS_GE_256-NEXT: sunpklo z22.h, z6.b +; VBITS_GE_256-NEXT: sunpkhi z23.s, z20.h +; 
VBITS_GE_256-NEXT: sunpkhi z24.s, z22.h +; VBITS_GE_256-NEXT: sunpklo z20.s, z20.h +; VBITS_GE_256-NEXT: sunpklo z22.s, z22.h +; VBITS_GE_256-NEXT: sdivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_256-NEXT: sdivr z20.s, p1/m, z20.s, z22.s +; VBITS_GE_256-NEXT: uzp1 z19.h, z19.h, z21.h +; VBITS_GE_256-NEXT: sunpkhi z21.h, z17.b +; VBITS_GE_256-NEXT: sunpkhi z22.h, z7.b +; VBITS_GE_256-NEXT: uzp1 z20.h, z20.h, z23.h +; VBITS_GE_256-NEXT: sunpkhi z23.s, z21.h +; VBITS_GE_256-NEXT: sunpkhi z24.s, z22.h +; VBITS_GE_256-NEXT: sunpklo z21.s, z21.h +; VBITS_GE_256-NEXT: sunpklo z22.s, z22.h +; VBITS_GE_256-NEXT: sdivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_256-NEXT: sdivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_256-NEXT: sunpklo z22.h, z17.b +; VBITS_GE_256-NEXT: sunpklo z24.h, z7.b +; VBITS_GE_256-NEXT: sunpkhi z25.s, z22.h +; VBITS_GE_256-NEXT: sunpkhi z27.s, z24.h +; VBITS_GE_256-NEXT: sunpklo z22.s, z22.h +; VBITS_GE_256-NEXT: sunpklo z24.s, z24.h +; VBITS_GE_256-NEXT: sdivr z25.s, p1/m, z25.s, z27.s +; VBITS_GE_256-NEXT: sdivr z22.s, p1/m, z22.s, z24.s +; VBITS_GE_256-NEXT: uzp1 z21.h, z21.h, z23.h +; VBITS_GE_256-NEXT: uzp1 z22.h, z22.h, z25.h +; VBITS_GE_256-NEXT: uzp1 z19.b, z20.b, z19.b +; VBITS_GE_256-NEXT: uzp1 z20.b, z22.b, z21.b +; VBITS_GE_256-NEXT: ld1b { z18.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: mls z6.b, p0/m, z19.b, z26.b +; VBITS_GE_256-NEXT: mls z7.b, p0/m, z20.b, z17.b +; VBITS_GE_256-NEXT: sunpkhi z17.h, z16.b +; VBITS_GE_256-NEXT: sunpkhi z19.h, z0.b +; VBITS_GE_256-NEXT: sunpkhi z20.s, z17.h +; VBITS_GE_256-NEXT: sunpkhi z21.s, z19.h +; VBITS_GE_256-NEXT: sunpklo z17.s, z17.h +; VBITS_GE_256-NEXT: sunpklo z19.s, z19.h +; VBITS_GE_256-NEXT: sdivr z20.s, p1/m, z20.s, z21.s +; VBITS_GE_256-NEXT: sdivr z17.s, p1/m, z17.s, z19.s +; VBITS_GE_256-NEXT: sunpklo z19.h, z16.b +; VBITS_GE_256-NEXT: sunpklo z21.h, z0.b +; VBITS_GE_256-NEXT: sunpkhi z22.s, z19.h +; VBITS_GE_256-NEXT: sunpkhi z23.s, z21.h +; VBITS_GE_256-NEXT: sunpklo z19.s, z19.h +; VBITS_GE_256-NEXT: sunpklo z21.s, z21.h +; VBITS_GE_256-NEXT: sdivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: sdivr z19.s, p1/m, z19.s, z21.s +; VBITS_GE_256-NEXT: uzp1 z17.h, z17.h, z20.h +; VBITS_GE_256-NEXT: sunpkhi z20.h, z18.b +; VBITS_GE_256-NEXT: sunpkhi z21.h, z1.b +; VBITS_GE_256-NEXT: uzp1 z19.h, z19.h, z22.h +; VBITS_GE_256-NEXT: sunpkhi z22.s, z20.h +; VBITS_GE_256-NEXT: sunpkhi z23.s, z21.h +; VBITS_GE_256-NEXT: sunpklo z20.s, z20.h +; VBITS_GE_256-NEXT: sunpklo z21.s, z21.h +; VBITS_GE_256-NEXT: sdivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: sdivr z20.s, p1/m, z20.s, z21.s +; VBITS_GE_256-NEXT: sunpklo z21.h, z18.b +; VBITS_GE_256-NEXT: sunpklo z23.h, z1.b +; VBITS_GE_256-NEXT: sunpkhi z24.s, z21.h +; VBITS_GE_256-NEXT: sunpkhi z25.s, z23.h +; VBITS_GE_256-NEXT: sunpklo z21.s, z21.h +; VBITS_GE_256-NEXT: sunpklo z23.s, z23.h +; VBITS_GE_256-NEXT: sdivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_256-NEXT: sdivr z21.s, p1/m, z21.s, z23.s +; VBITS_GE_256-NEXT: uzp1 z20.h, z20.h, z22.h +; VBITS_GE_256-NEXT: uzp1 z21.h, z21.h, z24.h +; VBITS_GE_256-NEXT: uzp1 z17.b, z19.b, z17.b +; VBITS_GE_256-NEXT: uzp1 z19.b, z21.b, z20.b +; VBITS_GE_256-NEXT: mls z0.b, p0/m, z17.b, z16.b +; VBITS_GE_256-NEXT: mls z1.b, p0/m, z19.b, z18.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z7.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z6.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: st1b { z5.b }, p0, [x0, x11] +; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0, x12] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x13] +; 
VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x14]
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+ %op1 = load <256 x i8>, <256 x i8>* %a
+ %op2 = load <256 x i8>, <256 x i8>* %b
+ %res = srem <256 x i8> %op1, %op2
+ store <256 x i8> %res, <256 x i8>* %a
+ ret void
+}
+
+define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+; CHECK-LABEL: srem_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: sunpklo z3.s, z0.h
+; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: mov z3.s, z2.s[3]
+; CHECK-NEXT: mov z4.s, z2.s[2]
+; CHECK-NEXT: mov z2.s, z2.s[1]
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: strh w8, [sp, #8]
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: strh w9, [sp, #14]
+; CHECK-NEXT: strh w8, [sp, #10]
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: strh w10, [sp, #12]
+; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8]
+; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %res = srem <4 x i16> %op1, %op2
+ ret <4 x i16> %res
+}
+
+define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+; VBITS_GE_128-LABEL: srem_v8i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: sunpkhi z2.s, z1.h
+; VBITS_GE_128-NEXT: sunpkhi z3.s, z0.h
+; VBITS_GE_128-NEXT: sunpklo z4.s, z1.h
+; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT: sunpklo z5.s, z0.h
+; VBITS_GE_128-NEXT: movprfx z3, z5
+; VBITS_GE_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
+; VBITS_GE_128-NEXT: uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_128-NEXT: ptrue p0.h, vl8
+; VBITS_GE_128-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v8i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_256-NEXT: ptrue p0.h, vl8
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: srem_v8i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: sunpklo z2.s, z1.h
+; VBITS_GE_512-NEXT: sunpklo z3.s, z0.h
+; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_512-NEXT: ptrue p0.h, vl8
+; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_512-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
+ %res = srem <8 x i16> %op1, %op2
+ ret <8 x i16> %res
+}
+
+define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+; VBITS_GE_128-LABEL: srem_v16i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x8, #8
+;
VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ptrue p1.s, vl4 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: sunpkhi z5.s, z0.h +; VBITS_GE_128-NEXT: sunpkhi z16.s, z1.h +; VBITS_GE_128-NEXT: sunpkhi z4.s, z2.h +; VBITS_GE_128-NEXT: sunpkhi z7.s, z3.h +; VBITS_GE_128-NEXT: sdivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_128-NEXT: sunpklo z5.s, z3.h +; VBITS_GE_128-NEXT: sdivr z7.s, p1/m, z7.s, z16.s +; VBITS_GE_128-NEXT: sunpklo z16.s, z1.h +; VBITS_GE_128-NEXT: sunpklo z6.s, z2.h +; VBITS_GE_128-NEXT: sdivr z5.s, p1/m, z5.s, z16.s +; VBITS_GE_128-NEXT: sunpklo z16.s, z0.h +; VBITS_GE_128-NEXT: uzp1 z5.h, z5.h, z7.h +; VBITS_GE_128-NEXT: sdivr z6.s, p1/m, z6.s, z16.s +; VBITS_GE_128-NEXT: mls z1.h, p0/m, z5.h, z3.h +; VBITS_GE_128-NEXT: uzp1 z4.h, z6.h, z4.h +; VBITS_GE_128-NEXT: mls z0.h, p0/m, z4.h, z2.h +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: srem_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: sunpkhi z2.s, z1.h +; VBITS_GE_256-NEXT: sunpkhi z3.s, z0.h +; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s +; VBITS_GE_256-NEXT: sunpklo z4.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z5.s, z0.h +; VBITS_GE_256-NEXT: movprfx z3, z5 +; VBITS_GE_256-NEXT: sdiv z3.s, p1/m, z3.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z2.h +; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: srem_v16i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl16 +; VBITS_GE_512-NEXT: ptrue p1.s, vl16 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: sunpklo z2.s, z1.h +; VBITS_GE_512-NEXT: sunpklo z3.s, z0.h +; VBITS_GE_512-NEXT: sdivr z2.s, p1/m, z2.s, z3.s +; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_512-NEXT: mls z0.h, p0/m, z2.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = srem <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @srem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: srem_v32i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #16 +; VBITS_GE_128-NEXT: mov x9, #24 +; VBITS_GE_128-NEXT: mov x10, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ptrue p1.s, vl4 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: sunpklo z20.s, z1.h +; VBITS_GE_128-NEXT: sunpklo z22.s, z0.h +; VBITS_GE_128-NEXT: sunpkhi z16.s, z2.h +; VBITS_GE_128-NEXT: sunpklo z18.s, z2.h +; VBITS_GE_128-NEXT: sunpkhi z7.s, z4.h +; VBITS_GE_128-NEXT: sunpklo z17.s, z4.h +; VBITS_GE_128-NEXT: sdivr z7.s, p1/m, z7.s, z16.s +; VBITS_GE_128-NEXT: movprfx z16, z18 +; 
+define void @srem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
+; VBITS_GE_128-LABEL: srem_v32i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x8, #16
+; VBITS_GE_128-NEXT: mov x9, #24
+; VBITS_GE_128-NEXT: mov x10, #8
+; VBITS_GE_128-NEXT: ptrue p0.h, vl8
+; VBITS_GE_128-NEXT: ptrue p1.s, vl4
+; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x1, x10, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_128-NEXT: sunpklo z20.s, z1.h
+; VBITS_GE_128-NEXT: sunpklo z22.s, z0.h
+; VBITS_GE_128-NEXT: sunpkhi z16.s, z2.h
+; VBITS_GE_128-NEXT: sunpklo z18.s, z2.h
+; VBITS_GE_128-NEXT: sunpkhi z7.s, z4.h
+; VBITS_GE_128-NEXT: sunpklo z17.s, z4.h
+; VBITS_GE_128-NEXT: sdivr z7.s, p1/m, z7.s, z16.s
+; VBITS_GE_128-NEXT: movprfx z16, z18
+; VBITS_GE_128-NEXT: sdiv z16.s, p1/m, z16.s, z17.s
+; VBITS_GE_128-NEXT: ld1h { z17.h }, p0/z, [x1]
+; VBITS_GE_128-NEXT: uzp1 z7.h, z16.h, z7.h
+; VBITS_GE_128-NEXT: sunpkhi z16.s, z6.h
+; VBITS_GE_128-NEXT: sunpkhi z18.s, z1.h
+; VBITS_GE_128-NEXT: sunpklo z19.s, z6.h
+; VBITS_GE_128-NEXT: sdivr z16.s, p1/m, z16.s, z18.s
+; VBITS_GE_128-NEXT: movprfx z18, z20
+; VBITS_GE_128-NEXT: sdiv z18.s, p1/m, z18.s, z19.s
+; VBITS_GE_128-NEXT: sunpkhi z19.s, z5.h
+; VBITS_GE_128-NEXT: sunpkhi z20.s, z0.h
+; VBITS_GE_128-NEXT: sunpklo z21.s, z5.h
+; VBITS_GE_128-NEXT: sdivr z19.s, p1/m, z19.s, z20.s
+; VBITS_GE_128-NEXT: movprfx z20, z22
+; VBITS_GE_128-NEXT: sdiv z20.s, p1/m, z20.s, z21.s
+; VBITS_GE_128-NEXT: uzp1 z16.h, z18.h, z16.h
+; VBITS_GE_128-NEXT: uzp1 z18.h, z20.h, z19.h
+; VBITS_GE_128-NEXT: mls z1.h, p0/m, z16.h, z6.h
+; VBITS_GE_128-NEXT: mls z0.h, p0/m, z18.h, z5.h
+; VBITS_GE_128-NEXT: sunpkhi z5.s, z17.h
+; VBITS_GE_128-NEXT: sunpkhi z6.s, z3.h
+; VBITS_GE_128-NEXT: sdivr z5.s, p1/m, z5.s, z6.s
+; VBITS_GE_128-NEXT: sunpklo z16.s, z17.h
+; VBITS_GE_128-NEXT: sunpklo z18.s, z3.h
+; VBITS_GE_128-NEXT: movprfx z6, z18
+; VBITS_GE_128-NEXT: sdiv z6.s, p1/m, z6.s, z16.s
+; VBITS_GE_128-NEXT: uzp1 z5.h, z6.h, z5.h
+; VBITS_GE_128-NEXT: mls z2.h, p0/m, z7.h, z4.h
+; VBITS_GE_128-NEXT: mls z3.h, p0/m, z5.h, z17.h
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: stp q3, q2, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ptrue p1.s, vl8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: sunpkhi z5.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z7.s, z0.h
+; VBITS_GE_256-NEXT: sunpkhi z4.s, z2.h
+; VBITS_GE_256-NEXT: sunpklo z6.s, z2.h
+; VBITS_GE_256-NEXT: sunpkhi z16.s, z3.h
+; VBITS_GE_256-NEXT: sunpkhi z17.s, z1.h
+; VBITS_GE_256-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
+; VBITS_GE_256-NEXT: sunpklo z5.s, z3.h
+; VBITS_GE_256-NEXT: sdivr z6.s, p1/m, z6.s, z7.s
+; VBITS_GE_256-NEXT: sunpklo z7.s, z1.h
+; VBITS_GE_256-NEXT: sdivr z16.s, p1/m, z16.s, z17.s
+; VBITS_GE_256-NEXT: sdivr z5.s, p1/m, z5.s, z7.s
+; VBITS_GE_256-NEXT: uzp1 z4.h, z6.h, z4.h
+; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z16.h
+; VBITS_GE_256-NEXT: mls z0.h, p0/m, z4.h, z2.h
+; VBITS_GE_256-NEXT: mls z1.h, p0/m, z5.h, z3.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+  %op1 = load <32 x i16>, <32 x i16>* %a
+  %op2 = load <32 x i16>, <32 x i16>* %b
+  %res = srem <32 x i16> %op1, %op2
+  store <32 x i16> %res, <32 x i16>* %a
+  ret void
+}
+
+define void @srem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+; VBITS_GE_128-LABEL: srem_v64i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x8, #48
+; VBITS_GE_128-NEXT: mov x9, #56
+; VBITS_GE_128-NEXT: mov x10, #32
+; VBITS_GE_128-NEXT: mov x11, #40
+; VBITS_GE_128-NEXT: mov x12, #16
+; VBITS_GE_128-NEXT: mov x13, #24
+; VBITS_GE_128-NEXT: mov x14, #8
+; VBITS_GE_128-NEXT: ptrue p0.h, vl8
+; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z7.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x0, x10, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x0, x11, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x13, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x14, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1h { z16.h }, p0/z, [x1, x14, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z21.h }, p0/z, [x1, x13, lsl #1]
+; VBITS_GE_128-NEXT: ptrue p1.s, vl4
+; VBITS_GE_128-NEXT: ld1h { z18.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z19.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z20.h }, p0/z, [x1, x11, lsl #1]
+; VBITS_GE_128-NEXT: sunpkhi z22.s, z1.h
+; VBITS_GE_128-NEXT: sunpklo z24.s, z1.h
+; VBITS_GE_128-NEXT: sunpkhi z17.s, z16.h
+; VBITS_GE_128-NEXT: sunpklo z23.s, z16.h
+; VBITS_GE_128-NEXT: sdivr z17.s, p1/m, z17.s, z22.s
+; VBITS_GE_128-NEXT: movprfx z22, z24
+; VBITS_GE_128-NEXT: sdiv z22.s, p1/m, z22.s, z23.s
+; VBITS_GE_128-NEXT: ld1h { z23.h }, p0/z, [x1, x12, lsl #1]
+; VBITS_GE_128-NEXT: uzp1 z17.h, z22.h, z17.h
+; VBITS_GE_128-NEXT: sunpkhi z22.s, z21.h
+; VBITS_GE_128-NEXT: sunpkhi z24.s, z2.h
+; VBITS_GE_128-NEXT: sunpklo z25.s, z21.h
+; VBITS_GE_128-NEXT: sunpklo z26.s, z2.h
+; VBITS_GE_128-NEXT: sdivr z22.s, p1/m, z22.s, z24.s
+; VBITS_GE_128-NEXT: movprfx z24, z26
+; VBITS_GE_128-NEXT: sdiv z24.s, p1/m, z24.s, z25.s
+; VBITS_GE_128-NEXT: sunpkhi z25.s, z23.h
+; VBITS_GE_128-NEXT: sunpkhi z26.s, z4.h
+; VBITS_GE_128-NEXT: uzp1 z22.h, z24.h, z22.h
+; VBITS_GE_128-NEXT: movprfx z24, z26
+; VBITS_GE_128-NEXT: sdiv z24.s, p1/m, z24.s, z25.s
+; VBITS_GE_128-NEXT: sunpklo z25.s, z23.h
+; VBITS_GE_128-NEXT: sunpklo z26.s, z4.h
+; VBITS_GE_128-NEXT: sdivr z25.s, p1/m, z25.s, z26.s
+; VBITS_GE_128-NEXT: ld1h { z26.h }, p0/z, [x1, x10, lsl #1]
+; VBITS_GE_128-NEXT: mls z2.h, p0/m, z22.h, z21.h
+; VBITS_GE_128-NEXT: uzp1 z21.h, z25.h, z24.h
+; VBITS_GE_128-NEXT: sunpkhi z22.s, z20.h
+; VBITS_GE_128-NEXT: sunpkhi z24.s, z5.h
+; VBITS_GE_128-NEXT: mls z4.h, p0/m, z21.h, z23.h
+; VBITS_GE_128-NEXT: movprfx z21, z24
+; VBITS_GE_128-NEXT: sdiv z21.s, p1/m, z21.s, z22.s
+; VBITS_GE_128-NEXT: sunpklo z22.s, z20.h
+; VBITS_GE_128-NEXT: sunpklo z23.s, z5.h
+; VBITS_GE_128-NEXT: sunpkhi z24.s, z26.h
+; VBITS_GE_128-NEXT: sunpkhi z25.s, z6.h
+; VBITS_GE_128-NEXT: sdivr z22.s, p1/m, z22.s, z23.s
+; VBITS_GE_128-NEXT: movprfx z23, z25
+; VBITS_GE_128-NEXT: sdiv z23.s, p1/m, z23.s, z24.s
+; VBITS_GE_128-NEXT: sunpklo z24.s, z26.h
+; VBITS_GE_128-NEXT: sunpklo z25.s, z6.h
+; VBITS_GE_128-NEXT: sdivr z24.s, p1/m, z24.s, z25.s
+; VBITS_GE_128-NEXT: uzp1 z21.h, z22.h, z21.h
+; VBITS_GE_128-NEXT: uzp1 z22.h, z24.h, z23.h
+; VBITS_GE_128-NEXT: mls z5.h, p0/m, z21.h, z20.h
+; VBITS_GE_128-NEXT: mls z6.h, p0/m, z22.h, z26.h
+; VBITS_GE_128-NEXT: sunpkhi z20.s, z19.h
+; VBITS_GE_128-NEXT: sunpkhi z21.s, z7.h
+; VBITS_GE_128-NEXT: sunpklo z22.s, z19.h
+; VBITS_GE_128-NEXT: sunpklo z23.s, z7.h
+; VBITS_GE_128-NEXT: ld1h { z25.h }, p0/z, [x1]
+; VBITS_GE_128-NEXT: sdivr z20.s, p1/m, z20.s, z21.s
+; VBITS_GE_128-NEXT: movprfx z21, z23
+; VBITS_GE_128-NEXT: sdiv z21.s, p1/m, z21.s, z22.s
+; VBITS_GE_128-NEXT: sunpkhi z22.s, z18.h
+; VBITS_GE_128-NEXT: sunpkhi z23.s, z0.h
+; VBITS_GE_128-NEXT: sdivr z22.s, p1/m, z22.s, z23.s
+; VBITS_GE_128-NEXT: sunpklo z24.s, z18.h
+; VBITS_GE_128-NEXT: sunpklo z26.s, z0.h
+; VBITS_GE_128-NEXT: movprfx z23, z26
+; VBITS_GE_128-NEXT: sdiv z23.s, p1/m, z23.s, z24.s
+; VBITS_GE_128-NEXT: uzp1 z20.h, z21.h, z20.h
+; VBITS_GE_128-NEXT: uzp1 z21.h, z23.h, z22.h
+; VBITS_GE_128-NEXT: mls z7.h, p0/m, z20.h, z19.h
+; VBITS_GE_128-NEXT: mls z0.h, p0/m, z21.h, z18.h
+; VBITS_GE_128-NEXT: sunpkhi z18.s, z25.h
+; VBITS_GE_128-NEXT: sunpkhi z19.s, z3.h
+; VBITS_GE_128-NEXT: stp q0, q7, [x0, #96]
+; VBITS_GE_128-NEXT: sunpklo z0.s, z25.h
+; VBITS_GE_128-NEXT: sunpklo z7.s, z3.h
+; VBITS_GE_128-NEXT: sdivr z18.s, p1/m, z18.s, z19.s
+; VBITS_GE_128-NEXT: sdivr z0.s, p1/m, z0.s, z7.s
+; VBITS_GE_128-NEXT: mls z1.h, p0/m, z17.h, z16.h
+; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z18.h
+; VBITS_GE_128-NEXT: stp q4, q2, [x0, #32]
+; VBITS_GE_128-NEXT: mls z3.h, p0/m, z0.h, z25.h
+; VBITS_GE_128-NEXT: stp q6, q5, [x0, #64]
+; VBITS_GE_128-NEXT: stp q3, q1, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v64i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #32
+; VBITS_GE_256-NEXT: mov x9, #48
+; VBITS_GE_256-NEXT: mov x10, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ptrue p1.s, vl8
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x10, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: sunpkhi z19.s, z1.h
+; VBITS_GE_256-NEXT: sunpkhi z16.s, z2.h
+; VBITS_GE_256-NEXT: sunpkhi z7.s, z4.h
+; VBITS_GE_256-NEXT: sunpklo z17.s, z4.h
+; VBITS_GE_256-NEXT: sdivr z7.s, p1/m, z7.s, z16.s
+; VBITS_GE_256-NEXT: sunpklo z16.s, z2.h
+; VBITS_GE_256-NEXT: sunpkhi z18.s, z5.h
+; VBITS_GE_256-NEXT: sdiv z16.s, p1/m, z16.s, z17.s
+; VBITS_GE_256-NEXT: movprfx z17, z19
+; VBITS_GE_256-NEXT: sdiv z17.s, p1/m, z17.s, z18.s
+; VBITS_GE_256-NEXT: sunpklo z18.s, z5.h
+; VBITS_GE_256-NEXT: sunpklo z19.s, z1.h
+; VBITS_GE_256-NEXT: uzp1 z7.h, z16.h, z7.h
+; VBITS_GE_256-NEXT: sdivr z18.s, p1/m, z18.s, z19.s
+; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: uzp1 z16.h, z18.h, z17.h
+; VBITS_GE_256-NEXT: mls z2.h, p0/m, z7.h, z4.h
+; VBITS_GE_256-NEXT: mls z1.h, p0/m, z16.h, z5.h
+; VBITS_GE_256-NEXT: sunpkhi z4.s, z6.h
+; VBITS_GE_256-NEXT: sunpkhi z5.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z7.s, z6.h
+; VBITS_GE_256-NEXT: sunpklo z16.s, z0.h
+; VBITS_GE_256-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
+; VBITS_GE_256-NEXT: movprfx z5, z16
+; VBITS_GE_256-NEXT: sdiv z5.s, p1/m, z5.s, z7.s
+; VBITS_GE_256-NEXT: sunpkhi z7.s, z19.h
+; VBITS_GE_256-NEXT: sunpkhi z16.s, z3.h
+; VBITS_GE_256-NEXT: sunpklo z17.s, z19.h
+; VBITS_GE_256-NEXT: sdivr z7.s, p1/m, z7.s, z16.s
+; VBITS_GE_256-NEXT: sunpklo z18.s, z3.h
+; VBITS_GE_256-NEXT: movprfx z16, z18
+; VBITS_GE_256-NEXT: sdiv z16.s, p1/m, z16.s, z17.s
+; VBITS_GE_256-NEXT: uzp1 z4.h, z5.h, z4.h
+; VBITS_GE_256-NEXT: uzp1 z5.h, z16.h, z7.h
+; VBITS_GE_256-NEXT: mls z0.h, p0/m, z4.h, z6.h
+; VBITS_GE_256-NEXT: mls z3.h, p0/m, z5.h, z19.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+  %op1 = load <64 x i16>, <64 x i16>* %a
+  %op2 = load <64 x i16>, <64 x i16>* %b
+  %res = srem <64 x i16> %op1, %op2
+  store <64 x i16> %res, <64 x i16>* %a
+  ret void
+}
+
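+; The v128i16 case below runs hot enough on registers to need z8, whose low
+; half d8 is callee-saved under AAPCS64, hence the str/ldr of d8 around the
+; VBITS_GE_128 body.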
+define void @srem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+; VBITS_GE_128-LABEL: srem_v128i16:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill
+; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16
+; VBITS_GE_128-NEXT: .cfi_offset b8, -16
+; VBITS_GE_128-NEXT: mov x16, #120
+; VBITS_GE_128-NEXT: mov x17, #112
+; VBITS_GE_128-NEXT: mov x9, #104
+; VBITS_GE_128-NEXT: mov x10, #96
+; VBITS_GE_128-NEXT: mov x11, #88
+; VBITS_GE_128-NEXT: mov x12, #80
+; VBITS_GE_128-NEXT: mov x13, #72
+; VBITS_GE_128-NEXT: mov x8, #64
+; VBITS_GE_128-NEXT: mov x14, #56
+; VBITS_GE_128-NEXT: mov x15, #48
+; VBITS_GE_128-NEXT: mov x18, #40
+; VBITS_GE_128-NEXT: mov x2, #32
+; VBITS_GE_128-NEXT: mov x3, #24
+; VBITS_GE_128-NEXT: mov x4, #16
+; VBITS_GE_128-NEXT: mov x5, #8
+; VBITS_GE_128-NEXT: ptrue p0.h, vl8
+; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x16, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x17, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z16.h }, p0/z, [x0, x10, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z7.h }, p0/z, [x0, x11, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x0, x12, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0, x14, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z23.h }, p0/z, [x0, x15, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z22.h }, p0/z, [x0, x18, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z21.h }, p0/z, [x0, x2, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z20.h }, p0/z, [x0, x3, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z19.h }, p0/z, [x0, x4, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z18.h }, p0/z, [x0, x5, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z17.h }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1h { z26.h }, p0/z, [x1, x5, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z28.h }, p0/z, [x1, x4, lsl #1]
+; VBITS_GE_128-NEXT: ptrue p1.s, vl4
+; VBITS_GE_128-NEXT: ld1h { z24.h }, p0/z, [x1, x16, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z25.h }, p0/z, [x1, x17, lsl #1]
+; VBITS_GE_128-NEXT: ld1h { z27.h }, p0/z, [x1, x3, lsl #1]
+; VBITS_GE_128-NEXT: sunpkhi z30.s, z18.h
+; VBITS_GE_128-NEXT: sunpklo z8.s, z18.h
+; VBITS_GE_128-NEXT: sunpkhi z29.s, z26.h
+; VBITS_GE_128-NEXT: sunpklo z31.s, z26.h
+; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z30.s
+; VBITS_GE_128-NEXT: movprfx z30, z8
+; VBITS_GE_128-NEXT: sdiv z30.s, p1/m, z30.s, z31.s
+; VBITS_GE_128-NEXT: sunpkhi z31.s, z28.h
+; VBITS_GE_128-NEXT: sunpkhi z8.s, z19.h
+; VBITS_GE_128-NEXT: uzp1 z29.h, z30.h, z29.h
+; VBITS_GE_128-NEXT: movprfx z30, z8
+; VBITS_GE_128-NEXT: sdiv z30.s, p1/m, z30.s, z31.s
+; VBITS_GE_128-NEXT: sunpklo z31.s, z28.h
+; VBITS_GE_128-NEXT: sunpklo z8.s, z19.h
+; VBITS_GE_128-NEXT: sdivr z31.s, p1/m, z31.s, z8.s
+; VBITS_GE_128-NEXT: mls z18.h, p0/m, z29.h, z26.h
+; VBITS_GE_128-NEXT: uzp1 z30.h, z31.h, z30.h
+; VBITS_GE_128-NEXT: ld1h { z31.h }, p0/z, [x1, x2, lsl #1]
+; VBITS_GE_128-NEXT: mls z19.h, p0/m, z30.h, z28.h
+; VBITS_GE_128-NEXT: sunpkhi z26.s, z27.h
+; VBITS_GE_128-NEXT: sunpkhi z28.s, z20.h
+; VBITS_GE_128-NEXT: sunpklo z29.s, z27.h
+; VBITS_GE_128-NEXT: sunpklo z30.s, z20.h
+; VBITS_GE_128-NEXT: ld1h { z8.h }, p0/z, [x1, x18, lsl #1]
+; VBITS_GE_128-NEXT: sdivr z26.s, p1/m, z26.s, z28.s
+; VBITS_GE_128-NEXT: movprfx z28, z30
+; VBITS_GE_128-NEXT: sdiv z28.s, p1/m, z28.s, z29.s
+; VBITS_GE_128-NEXT: sunpkhi z29.s, z31.h
+; VBITS_GE_128-NEXT: sunpkhi z30.s, z21.h
+; VBITS_GE_128-NEXT: uzp1 z26.h, z28.h, z26.h
+; VBITS_GE_128-NEXT: movprfx z28, z30
+; VBITS_GE_128-NEXT: sdiv z28.s, p1/m, z28.s, z29.s
+; VBITS_GE_128-NEXT: sunpklo z29.s, z31.h
+; VBITS_GE_128-NEXT: sunpklo z30.s, z21.h
+; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z30.s
+; VBITS_GE_128-NEXT: mls z20.h, p0/m, z26.h, z27.h
+; VBITS_GE_128-NEXT: uzp1 z28.h, z29.h, z28.h
+; VBITS_GE_128-NEXT: ld1h { z29.h }, p0/z, [x1, x15, lsl #1]
+; VBITS_GE_128-NEXT: mls z21.h, p0/m, z28.h, z31.h
+; VBITS_GE_128-NEXT: sunpkhi z26.s, z8.h
+; VBITS_GE_128-NEXT: sunpkhi z27.s, z22.h
+; VBITS_GE_128-NEXT: sunpklo z28.s, z8.h
+; VBITS_GE_128-NEXT: sunpklo z31.s, z22.h
+; VBITS_GE_128-NEXT: ld1h { z30.h }, p0/z, [x1, x14, lsl #1]
+; VBITS_GE_128-NEXT: sdivr z26.s, p1/m, z26.s, z27.s
+; VBITS_GE_128-NEXT: movprfx z27, z31
+; VBITS_GE_128-NEXT: sdiv z27.s, p1/m, z27.s, z28.s
+; VBITS_GE_128-NEXT: sunpkhi z28.s, z29.h
+; VBITS_GE_128-NEXT: sunpkhi z31.s, z23.h
+; VBITS_GE_128-NEXT: uzp1 z26.h, z27.h, z26.h
+; VBITS_GE_128-NEXT: movprfx z27, z31
+; VBITS_GE_128-NEXT: sdiv z27.s, p1/m, z27.s, z28.s
+; VBITS_GE_128-NEXT: sunpklo z28.s, z29.h
+; VBITS_GE_128-NEXT: sunpklo z31.s, z23.h
+; VBITS_GE_128-NEXT: sdivr z28.s, p1/m, z28.s, z31.s
+; VBITS_GE_128-NEXT: mls z22.h, p0/m, z26.h, z8.h
+; VBITS_GE_128-NEXT: uzp1 z27.h, z28.h, z27.h
+; VBITS_GE_128-NEXT: ld1h { z28.h }, p0/z, [x1, x13, lsl #1]
+; VBITS_GE_128-NEXT: mls z23.h, p0/m, z27.h, z29.h
+; VBITS_GE_128-NEXT: sunpkhi z26.s, z30.h
+; VBITS_GE_128-NEXT: sunpkhi z27.s, z3.h
+; VBITS_GE_128-NEXT: sunpklo z29.s, z30.h
+; VBITS_GE_128-NEXT: sunpklo z8.s, z3.h
+; VBITS_GE_128-NEXT: ld1h { z31.h }, p0/z, [x1, x12, lsl #1]
+; VBITS_GE_128-NEXT: sdivr z26.s, p1/m, z26.s, z27.s
+; VBITS_GE_128-NEXT: movprfx z27, z8
+; VBITS_GE_128-NEXT: sdiv z27.s, p1/m, z27.s, z29.s
+; VBITS_GE_128-NEXT: sunpkhi z29.s, z28.h
+; VBITS_GE_128-NEXT: sunpkhi z8.s, z5.h
+; VBITS_GE_128-NEXT: uzp1 z26.h, z27.h, z26.h
+; VBITS_GE_128-NEXT: movprfx z27, z8
+; VBITS_GE_128-NEXT: sdiv z27.s, p1/m, z27.s, z29.s
+; VBITS_GE_128-NEXT: sunpklo z29.s, z28.h
+; VBITS_GE_128-NEXT: sunpklo z8.s, z5.h
+; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z8.s
+; VBITS_GE_128-NEXT: mls z3.h, p0/m, z26.h, z30.h
+; VBITS_GE_128-NEXT: uzp1 z27.h, z29.h, z27.h
+; VBITS_GE_128-NEXT: ld1h { z29.h }, p0/z, [x1, x11, lsl #1]
+; VBITS_GE_128-NEXT: mls z5.h, p0/m, z27.h, z28.h
+; VBITS_GE_128-NEXT: sunpkhi z26.s, z31.h
+; VBITS_GE_128-NEXT: sunpkhi z27.s, z6.h
+; VBITS_GE_128-NEXT: sunpklo z28.s, z31.h
+; VBITS_GE_128-NEXT: sunpklo z30.s, z6.h
+; VBITS_GE_128-NEXT: ld1h { z8.h }, p0/z, [x1, x10, lsl #1]
+; VBITS_GE_128-NEXT: sdivr z26.s, p1/m, z26.s, z27.s
+; VBITS_GE_128-NEXT: movprfx z27, z30
+; VBITS_GE_128-NEXT: sdiv z27.s, p1/m, z27.s, z28.s
+; VBITS_GE_128-NEXT: sunpkhi z28.s, z29.h
+; VBITS_GE_128-NEXT: sunpkhi z30.s, z7.h
+; VBITS_GE_128-NEXT: uzp1 z26.h, z27.h, z26.h
+; VBITS_GE_128-NEXT: movprfx z27, z30
+; VBITS_GE_128-NEXT: sdiv z27.s, p1/m, z27.s, z28.s
+; VBITS_GE_128-NEXT: sunpklo z28.s, z29.h
+; VBITS_GE_128-NEXT: sunpklo z30.s, z7.h
+; VBITS_GE_128-NEXT: sdivr z28.s, p1/m, z28.s, z30.s
+; VBITS_GE_128-NEXT: ld1h { z30.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_128-NEXT: mls z6.h, p0/m, z26.h, z31.h
+; VBITS_GE_128-NEXT: uzp1 z26.h, z28.h, z27.h
+; VBITS_GE_128-NEXT: sunpkhi z27.s, z8.h
+; VBITS_GE_128-NEXT: sunpkhi z28.s, z16.h
+; VBITS_GE_128-NEXT: mls z7.h, p0/m, z26.h, z29.h
+; VBITS_GE_128-NEXT: movprfx z26, z28
+; VBITS_GE_128-NEXT: sdiv z26.s, p1/m, z26.s, z27.s
+; VBITS_GE_128-NEXT: sunpklo z27.s, z8.h
+; VBITS_GE_128-NEXT: sunpklo z28.s, z16.h
+; VBITS_GE_128-NEXT: sunpkhi z29.s, z30.h
+; VBITS_GE_128-NEXT: sunpkhi z31.s, z0.h
+; VBITS_GE_128-NEXT: sdivr z27.s, p1/m, z27.s, z28.s
+; VBITS_GE_128-NEXT: movprfx z28, z31
+; VBITS_GE_128-NEXT: sdiv z28.s, p1/m, z28.s, z29.s
+; VBITS_GE_128-NEXT: sunpklo z29.s, z30.h
+; VBITS_GE_128-NEXT: sunpklo z31.s, z0.h
+; VBITS_GE_128-NEXT: sdivr z29.s, p1/m, z29.s, z31.s
+; VBITS_GE_128-NEXT: uzp1 z26.h, z27.h, z26.h
+; VBITS_GE_128-NEXT: uzp1 z27.h, z29.h, z28.h
+; VBITS_GE_128-NEXT: mls z16.h, p0/m, z26.h, z8.h
+; VBITS_GE_128-NEXT: mls z0.h, p0/m, z27.h, z30.h
+; VBITS_GE_128-NEXT: sunpkhi z26.s, z25.h
+; VBITS_GE_128-NEXT: sunpkhi z27.s, z1.h
+; VBITS_GE_128-NEXT: sunpklo z28.s, z25.h
+; VBITS_GE_128-NEXT: sunpklo z29.s, z1.h
+; VBITS_GE_128-NEXT: ld1h { z31.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_128-NEXT: sdivr z26.s, p1/m, z26.s, z27.s
+; VBITS_GE_128-NEXT: movprfx z27, z29
+; VBITS_GE_128-NEXT: sdiv z27.s, p1/m, z27.s, z28.s
+; VBITS_GE_128-NEXT: sunpkhi z28.s, z24.h
+; VBITS_GE_128-NEXT: sunpkhi z29.s, z2.h
+; VBITS_GE_128-NEXT: sdivr z28.s, p1/m, z28.s, z29.s
+; VBITS_GE_128-NEXT: sunpklo z30.s, z24.h
+; VBITS_GE_128-NEXT: sunpklo z8.s, z2.h
+; VBITS_GE_128-NEXT: movprfx z29, z8
+; VBITS_GE_128-NEXT: sdiv z29.s, p1/m, z29.s, z30.s
+; VBITS_GE_128-NEXT: uzp1 z26.h, z27.h, z26.h
+; VBITS_GE_128-NEXT: uzp1 z27.h, z29.h, z28.h
+; VBITS_GE_128-NEXT: mls z1.h, p0/m, z26.h, z25.h
+; VBITS_GE_128-NEXT: mls z2.h, p0/m, z27.h, z24.h
+; VBITS_GE_128-NEXT: ld1h { z24.h }, p0/z, [x1]
+; VBITS_GE_128-NEXT: sunpkhi z25.s, z31.h
+; VBITS_GE_128-NEXT: stp q19, q20, [x0, #32]
+; VBITS_GE_128-NEXT: stp q21, q22, [x0, #64]
+; VBITS_GE_128-NEXT: stp q23, q3, [x0, #96]
+; VBITS_GE_128-NEXT: stp q6, q7, [x0, #160]
+; VBITS_GE_128-NEXT: stp q16, q0, [x0, #192]
+; VBITS_GE_128-NEXT: sunpklo z0.s, z31.h
+; VBITS_GE_128-NEXT: stp q1, q2, [x0, #224]
+; VBITS_GE_128-NEXT: sunpkhi z2.s, z4.h
+; VBITS_GE_128-NEXT: sunpklo z1.s, z4.h
+; VBITS_GE_128-NEXT: sdiv z2.s, p1/m, z2.s, z25.s
+; VBITS_GE_128-NEXT: sdivr z0.s, p1/m, z0.s, z1.s
+; VBITS_GE_128-NEXT: sunpkhi z1.s, z17.h
+; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT: sunpklo z2.s, z17.h
+; VBITS_GE_128-NEXT: mls z4.h, p0/m, z0.h, z31.h
+; VBITS_GE_128-NEXT: sunpkhi z0.s, z24.h
+; VBITS_GE_128-NEXT: sdivr z0.s, p1/m, z0.s, z1.s
+; VBITS_GE_128-NEXT: sunpklo z1.s, z24.h
+; VBITS_GE_128-NEXT: sdivr z1.s, p1/m, z1.s, z2.s
+; VBITS_GE_128-NEXT: stp q4, q5, [x0, #128]
+; VBITS_GE_128-NEXT: uzp1 z0.h, z1.h, z0.h
+; VBITS_GE_128-NEXT: mls z17.h, p0/m, z0.h, z24.h
+; VBITS_GE_128-NEXT: stp q17, q18, [x0]
+; VBITS_GE_128-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v128i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #96
+; VBITS_GE_256-NEXT: mov x9, #112
+; VBITS_GE_256-NEXT: mov x10, #64
+; VBITS_GE_256-NEXT: mov x11, #80
+; VBITS_GE_256-NEXT: mov x12, #32
+; VBITS_GE_256-NEXT: mov x13, #48
+; VBITS_GE_256-NEXT: mov x14, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x10, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x11, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x13, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x14, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x13, lsl #1]
+; VBITS_GE_256-NEXT: ptrue p1.s, vl8
+; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x10, lsl #1]
+; VBITS_GE_256-NEXT: sunpkhi z23.s, z2.h
+; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x12, lsl #1]
+; VBITS_GE_256-NEXT: sunpkhi z22.s, z17.h
+; VBITS_GE_256-NEXT: sunpklo z24.s, z17.h
+; VBITS_GE_256-NEXT: sunpklo z25.s, z2.h
+; VBITS_GE_256-NEXT: sdivr z22.s, p1/m, z22.s, z23.s
+; VBITS_GE_256-NEXT: movprfx z23, z25
+; VBITS_GE_256-NEXT: sdiv z23.s, p1/m, z23.s, z24.s
+; VBITS_GE_256-NEXT: sunpkhi z24.s, z21.h
+; VBITS_GE_256-NEXT: sunpkhi z25.s, z3.h
+; VBITS_GE_256-NEXT: uzp1 z22.h, z23.h, z22.h
+; VBITS_GE_256-NEXT: movprfx z23, z25
+; VBITS_GE_256-NEXT: sdiv z23.s, p1/m, z23.s, z24.s
+; VBITS_GE_256-NEXT: ld1h { z26.h }, p0/z, [x1, x11, lsl #1]
+; VBITS_GE_256-NEXT: sunpklo z24.s, z21.h
+; VBITS_GE_256-NEXT: sunpklo z25.s, z3.h
+; VBITS_GE_256-NEXT: sdivr z24.s, p1/m, z24.s, z25.s
+; VBITS_GE_256-NEXT: mls z2.h, p0/m, z22.h, z17.h
+; VBITS_GE_256-NEXT: uzp1 z17.h, z24.h, z23.h
+; VBITS_GE_256-NEXT: sunpkhi z22.s, z20.h
+; VBITS_GE_256-NEXT: sunpkhi z23.s, z4.h
+; VBITS_GE_256-NEXT: mls z3.h, p0/m, z17.h, z21.h
+; VBITS_GE_256-NEXT: movprfx z17, z23
+; VBITS_GE_256-NEXT: sdiv z17.s, p1/m, z17.s, z22.s
+; VBITS_GE_256-NEXT: sunpklo z21.s, z20.h
+; VBITS_GE_256-NEXT: sunpklo z22.s, z4.h
+; VBITS_GE_256-NEXT: sunpkhi z23.s, z26.h
+; VBITS_GE_256-NEXT: sunpkhi z24.s, z5.h
+; VBITS_GE_256-NEXT: sdivr z21.s, p1/m, z21.s, z22.s
+; VBITS_GE_256-NEXT: movprfx z22, z24
+; VBITS_GE_256-NEXT: sdiv z22.s, p1/m, z22.s, z23.s
+; VBITS_GE_256-NEXT: sunpklo z23.s, z26.h
+; VBITS_GE_256-NEXT: sunpklo z24.s, z5.h
+; VBITS_GE_256-NEXT: uzp1 z17.h, z21.h, z17.h
+; VBITS_GE_256-NEXT: sdivr z23.s, p1/m, z23.s, z24.s
+; VBITS_GE_256-NEXT: mls z4.h, p0/m, z17.h, z20.h
+; VBITS_GE_256-NEXT: uzp1 z21.h, z23.h, z22.h
+; VBITS_GE_256-NEXT: sunpkhi z17.s, z19.h
+; VBITS_GE_256-NEXT: mls z5.h, p0/m, z21.h, z26.h
+; VBITS_GE_256-NEXT: sunpkhi z20.s, z6.h
+; VBITS_GE_256-NEXT: sunpklo z21.s, z19.h
+; VBITS_GE_256-NEXT: sunpklo z22.s, z6.h
+; VBITS_GE_256-NEXT: ld1h { z25.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: sdivr z17.s, p1/m, z17.s, z20.s
+; VBITS_GE_256-NEXT: movprfx z20, z22
+; VBITS_GE_256-NEXT: sdiv z20.s, p1/m, z20.s, z21.s
+; VBITS_GE_256-NEXT: sunpkhi z21.s, z18.h
+; VBITS_GE_256-NEXT: sunpkhi z22.s, z7.h
+; VBITS_GE_256-NEXT: sunpklo z23.s, z18.h
+; VBITS_GE_256-NEXT: sdivr z21.s, p1/m, z21.s, z22.s
+; VBITS_GE_256-NEXT: sunpklo z24.s, z7.h
+; VBITS_GE_256-NEXT: movprfx z22, z24
+; VBITS_GE_256-NEXT: sdiv z22.s, p1/m, z22.s, z23.s
+; VBITS_GE_256-NEXT: uzp1 z17.h, z20.h, z17.h
+; VBITS_GE_256-NEXT: uzp1 z20.h, z22.h, z21.h
+; VBITS_GE_256-NEXT: mls z6.h, p0/m, z17.h, z19.h
+; VBITS_GE_256-NEXT: mls z7.h, p0/m, z20.h, z18.h
+; VBITS_GE_256-NEXT: sunpkhi z17.s, z16.h
+; VBITS_GE_256-NEXT: sunpkhi z18.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z19.s, z16.h
+; VBITS_GE_256-NEXT: sunpklo z20.s, z0.h
+; VBITS_GE_256-NEXT: sdivr z17.s, p1/m, z17.s, z18.s
+; VBITS_GE_256-NEXT: movprfx z18, z20
+; VBITS_GE_256-NEXT: sdiv z18.s, p1/m, z18.s, z19.s
+; VBITS_GE_256-NEXT: sunpkhi z19.s, z25.h
+; VBITS_GE_256-NEXT: sunpkhi z20.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z21.s, z25.h
+; VBITS_GE_256-NEXT: sdivr z19.s, p1/m, z19.s, z20.s
+; VBITS_GE_256-NEXT: sunpklo z22.s, z1.h
+; VBITS_GE_256-NEXT: movprfx z20, z22
+; VBITS_GE_256-NEXT: sdiv z20.s, p1/m, z20.s, z21.s
+; VBITS_GE_256-NEXT: uzp1 z17.h, z18.h, z17.h
+; VBITS_GE_256-NEXT: uzp1 z18.h, z20.h, z19.h
+; VBITS_GE_256-NEXT: mls z0.h, p0/m, z17.h, z16.h
+; VBITS_GE_256-NEXT: mls z1.h, p0/m, z18.h, z25.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x10, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x11, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x13, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x14, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+  %op1 = load <128 x i16>, <128 x i16>* %a
+  %op2 = load <128 x i16>, <128 x i16>* %b
+  %res = srem <128 x i16> %op1, %op2
+  store <128 x i16> %res, <128 x i16>* %a
+  ret void
+}
+
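+; For 32-bit elements no widening is needed: sdiv is legal on .s vectors,
+; so the remainder is simply sdiv followed by mls.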
+define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+; CHECK-LABEL: srem_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+  %res = srem <2 x i32> %op1, %op2
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+; CHECK-LABEL: srem_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+  %res = srem <4 x i32> %op1, %op2
+  ret <4 x i32> %res
+}
+
+define void @srem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+; VBITS_GE_128-LABEL: srem_v8i32:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x8, #4
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_128-NEXT: movprfx z4, z1
+; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z3.s
+; VBITS_GE_128-NEXT: movprfx z5, z0
+; VBITS_GE_128-NEXT: sdiv z5.s, p0/m, z5.s, z2.s
+; VBITS_GE_128-NEXT: mls z1.s, p0/m, z4.s, z3.s
+; VBITS_GE_128-NEXT: mls z0.s, p0/m, z5.s, z2.s
+; VBITS_GE_128-NEXT: stp q1, q0, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v8i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: movprfx z2, z0
+; VBITS_GE_256-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
+; VBITS_GE_256-NEXT: mls z0.s, p0/m, z2.s, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: srem_v8i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: movprfx z2, z0
+; VBITS_GE_512-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
+; VBITS_GE_512-NEXT: mls z0.s, p0/m, z2.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %op2 = load <8 x i32>, <8 x i32>* %b
+  %res = srem <8 x i32> %op1, %op2
+  store <8 x i32> %res, <8 x i32>* %a
+  ret void
+}
+
+define void @srem_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
+; VBITS_GE_128-LABEL: srem_v16i32:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x8, #8
+; VBITS_GE_128-NEXT: mov x9, #12
+; VBITS_GE_128-NEXT: mov x10, #4
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x1]
+; VBITS_GE_128-NEXT: movprfx z16, z1
+; VBITS_GE_128-NEXT: sdiv z16.s, p0/m, z16.s, z4.s
+; VBITS_GE_128-NEXT: mls z1.s, p0/m, z16.s, z4.s
+; VBITS_GE_128-NEXT: movprfx z4, z0
+; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: movprfx z16, z2
+; VBITS_GE_128-NEXT: sdiv z16.s, p0/m, z16.s, z6.s
+; VBITS_GE_128-NEXT: mls z0.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: movprfx z4, z3
+; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z7.s
+; VBITS_GE_128-NEXT: mls z3.s, p0/m, z4.s, z7.s
+; VBITS_GE_128-NEXT: mls z2.s, p0/m, z16.s, z6.s
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: stp q3, q2, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: movprfx z4, z0
+; VBITS_GE_256-NEXT: sdiv z4.s, p0/m, z4.s, z2.s
+; VBITS_GE_256-NEXT: movprfx z5, z1
+; VBITS_GE_256-NEXT: sdiv z5.s, p0/m, z5.s, z3.s
+; VBITS_GE_256-NEXT: mls z0.s, p0/m, z4.s, z2.s
+; VBITS_GE_256-NEXT: mls z1.s, p0/m, z5.s, z3.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: srem_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: movprfx z2, z0
+; VBITS_GE_512-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
+; VBITS_GE_512-NEXT: mls z0.s, p0/m, z2.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %op2 = load <16 x i32>, <16 x i32>* %b
+  %res = srem <16 x i32> %op1, %op2
+  store <16 x i32> %res, <16 x i32>* %a
+  ret void
+}
+
+define void @srem_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+; VBITS_GE_128-LABEL: srem_v32i32:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x8, #24
+; VBITS_GE_128-NEXT: mov x9, #28
+; VBITS_GE_128-NEXT: mov x10, #16
+; VBITS_GE_128-NEXT: mov x11, #20
+; VBITS_GE_128-NEXT: mov x12, #8
+; VBITS_GE_128-NEXT: mov x13, #12
+; VBITS_GE_128-NEXT: mov x14, #4
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z18.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z17.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z19.s }, p0/z, [x1, x10, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z21.s }, p0/z, [x1, x11, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z20.s }, p0/z, [x1, x14, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z22.s }, p0/z, [x1, x12, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z23.s }, p0/z, [x1]
+; VBITS_GE_128-NEXT: movprfx z24, z5
+; VBITS_GE_128-NEXT: sdiv z24.s, p0/m, z24.s, z16.s
+; VBITS_GE_128-NEXT: mls z5.s, p0/m, z24.s, z16.s
+; VBITS_GE_128-NEXT: movprfx z16, z1
+; VBITS_GE_128-NEXT: sdiv z16.s, p0/m, z16.s, z18.s
+; VBITS_GE_128-NEXT: movprfx z24, z6
+; VBITS_GE_128-NEXT: sdiv z24.s, p0/m, z24.s, z20.s
+; VBITS_GE_128-NEXT: mls z1.s, p0/m, z16.s, z18.s
+; VBITS_GE_128-NEXT: movprfx z16, z4
+; VBITS_GE_128-NEXT: sdiv z16.s, p0/m, z16.s, z22.s
+; VBITS_GE_128-NEXT: movprfx z18, z0
+; VBITS_GE_128-NEXT: sdiv z18.s, p0/m, z18.s, z17.s
+; VBITS_GE_128-NEXT: mls z0.s, p0/m, z18.s, z17.s
+; VBITS_GE_128-NEXT: movprfx z17, z3
+; VBITS_GE_128-NEXT: sdiv z17.s, p0/m, z17.s, z21.s
+; VBITS_GE_128-NEXT: movprfx z18, z2
+; VBITS_GE_128-NEXT: sdiv z18.s, p0/m, z18.s, z19.s
+; VBITS_GE_128-NEXT: mls z2.s, p0/m, z18.s, z19.s
+; VBITS_GE_128-NEXT: mls z3.s, p0/m, z17.s, z21.s
+; VBITS_GE_128-NEXT: stp q2, q3, [x0, #64]
+; VBITS_GE_128-NEXT: mls z4.s, p0/m, z16.s, z22.s
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #96]
+; VBITS_GE_128-NEXT: movprfx z0, z7
+; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z23.s
+; VBITS_GE_128-NEXT: mls z7.s, p0/m, z0.s, z23.s
+; VBITS_GE_128-NEXT: mls z6.s, p0/m, z24.s, z20.s
+; VBITS_GE_128-NEXT: stp q7, q6, [x0]
+; VBITS_GE_128-NEXT: stp q4, q5, [x0, #32]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v32i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x9, #24
+; VBITS_GE_256-NEXT: mov x10, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: movprfx z16, z2
+; VBITS_GE_256-NEXT: sdiv z16.s, p0/m, z16.s, z5.s
+; VBITS_GE_256-NEXT: movprfx z17, z1
+; VBITS_GE_256-NEXT: sdiv z17.s, p0/m, z17.s, z4.s
+; VBITS_GE_256-NEXT: mls z2.s, p0/m, z16.s, z5.s
+; VBITS_GE_256-NEXT: mls z1.s, p0/m, z17.s, z4.s
+; VBITS_GE_256-NEXT: movprfx z4, z0
+; VBITS_GE_256-NEXT: sdiv z4.s, p0/m, z4.s, z6.s
+; VBITS_GE_256-NEXT: movprfx z5, z3
+; VBITS_GE_256-NEXT: sdiv z5.s, p0/m, z5.s, z7.s
+; VBITS_GE_256-NEXT: mls z0.s, p0/m, z4.s, z6.s
+; VBITS_GE_256-NEXT: mls z3.s, p0/m, z5.s, z7.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+  %op1 = load <32 x i32>, <32 x i32>* %a
+  %op2 = load <32 x i32>, <32 x i32>* %b
+  %res = srem <32 x i32> %op1, %op2
+  store <32 x i32> %res, <32 x i32>* %a
+  ret void
+}
+
+define void @srem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+; VBITS_GE_128-LABEL: srem_v64i32:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x15, #60
+; VBITS_GE_128-NEXT: mov x16, #56
+; VBITS_GE_128-NEXT: mov x13, #52
+; VBITS_GE_128-NEXT: mov x14, #48
+; VBITS_GE_128-NEXT: mov x17, #44
+; VBITS_GE_128-NEXT: mov x11, #40
+; VBITS_GE_128-NEXT: mov x18, #36
+; VBITS_GE_128-NEXT: mov x12, #32
+; VBITS_GE_128-NEXT: mov x2, #28
+; VBITS_GE_128-NEXT: mov x8, #24
+; VBITS_GE_128-NEXT: mov x3, #20
+; VBITS_GE_128-NEXT: mov x9, #16
+; VBITS_GE_128-NEXT: mov x4, #12
+; VBITS_GE_128-NEXT: mov x10, #8
+; VBITS_GE_128-NEXT: mov x5, #4
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x15, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x16, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x13, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z16.s }, p0/z, [x0, x14, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x0, x17, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x0, x11, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x0, x18, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0, x2, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z23.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z22.s }, p0/z, [x0, x3, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z21.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z20.s }, p0/z, [x0, x4, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z19.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z18.s }, p0/z, [x0, x5, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z17.s }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1w { z24.s }, p0/z, [x1, x5, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z25.s }, p0/z, [x1, x4, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z28.s }, p0/z, [x1, x3, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z26.s }, p0/z, [x1, x15, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z27.s }, p0/z, [x1, x16, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z29.s }, p0/z, [x1, x2, lsl #2]
+; VBITS_GE_128-NEXT: movprfx z30, z18
+; VBITS_GE_128-NEXT: sdiv z30.s, p0/m, z30.s, z24.s
+; VBITS_GE_128-NEXT: mls z18.s, p0/m, z30.s, z24.s
+; VBITS_GE_128-NEXT: movprfx z24, z20
+; VBITS_GE_128-NEXT: sdiv z24.s, p0/m, z24.s, z25.s
+; VBITS_GE_128-NEXT: ld1w { z30.s }, p0/z, [x1, x18, lsl #2]
+; VBITS_GE_128-NEXT: mls z20.s, p0/m, z24.s, z25.s
+; VBITS_GE_128-NEXT: ld1w { z24.s }, p0/z, [x1, x17, lsl #2]
+; VBITS_GE_128-NEXT: movprfx z25, z22
+; VBITS_GE_128-NEXT: sdiv z25.s, p0/m, z25.s, z28.s
+; VBITS_GE_128-NEXT: mls z22.s, p0/m, z25.s, z28.s
+; VBITS_GE_128-NEXT: movprfx z25, z3
+; VBITS_GE_128-NEXT: sdiv z25.s, p0/m, z25.s, z29.s
+; VBITS_GE_128-NEXT: ld1w { z28.s }, p0/z, [x1, x13, lsl #2]
+; VBITS_GE_128-NEXT: mls z3.s, p0/m, z25.s, z29.s
+; VBITS_GE_128-NEXT: ld1w { z25.s }, p0/z, [x1, x14, lsl #2]
+; VBITS_GE_128-NEXT: movprfx z29, z5
+; VBITS_GE_128-NEXT: sdiv z29.s, p0/m, z29.s, z30.s
+; VBITS_GE_128-NEXT: mls z5.s, p0/m, z29.s, z30.s
+; VBITS_GE_128-NEXT: movprfx z29, z7
+; VBITS_GE_128-NEXT: sdiv z29.s, p0/m, z29.s, z24.s
+; VBITS_GE_128-NEXT: ld1w { z30.s }, p0/z, [x1, x11, lsl #2]
+; VBITS_GE_128-NEXT: mls z7.s, p0/m, z29.s, z24.s
+; VBITS_GE_128-NEXT: ld1w { z24.s }, p0/z, [x1, x12, lsl #2]
+; VBITS_GE_128-NEXT: movprfx z29, z0
+; VBITS_GE_128-NEXT: sdiv z29.s, p0/m, z29.s, z28.s
+; VBITS_GE_128-NEXT: mls z0.s, p0/m, z29.s, z28.s
+; VBITS_GE_128-NEXT: movprfx z28, z1
+; VBITS_GE_128-NEXT: sdiv z28.s, p0/m, z28.s, z27.s
+; VBITS_GE_128-NEXT: ld1w { z29.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_128-NEXT: mls z1.s, p0/m, z28.s, z27.s
+; VBITS_GE_128-NEXT: ld1w { z27.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_128-NEXT: movprfx z28, z2
+; VBITS_GE_128-NEXT: sdiv z28.s, p0/m, z28.s, z26.s
+; VBITS_GE_128-NEXT: mls z2.s, p0/m, z28.s, z26.s
+; VBITS_GE_128-NEXT: ld1w { z26.s }, p0/z, [x1, x10, lsl #2]
+; VBITS_GE_128-NEXT: ld1w { z28.s }, p0/z, [x1]
+; VBITS_GE_128-NEXT: stp q1, q2, [x0, #224]
+; VBITS_GE_128-NEXT: movprfx z1, z16
+; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z25.s
+; VBITS_GE_128-NEXT: mls z16.s, p0/m, z1.s, z25.s
+; VBITS_GE_128-NEXT: stp q16, q0, [x0, #192]
+; VBITS_GE_128-NEXT: movprfx z0, z6
+; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z30.s
+; VBITS_GE_128-NEXT: mls z6.s, p0/m, z0.s, z30.s
+; VBITS_GE_128-NEXT: stp q6, q7, [x0, #160]
+; VBITS_GE_128-NEXT: movprfx z0, z4
+; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z24.s
+; VBITS_GE_128-NEXT: mls z4.s, p0/m, z0.s, z24.s
+; VBITS_GE_128-NEXT: stp q4, q5, [x0, #128]
+; VBITS_GE_128-NEXT: movprfx z0, z23
+; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z29.s
+; VBITS_GE_128-NEXT: mls z23.s, p0/m, z0.s, z29.s
+; VBITS_GE_128-NEXT: stp q23, q3, [x0, #96]
+; VBITS_GE_128-NEXT: movprfx z0, z21
+; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z27.s
+; VBITS_GE_128-NEXT: mls z21.s, p0/m, z0.s, z27.s
+; VBITS_GE_128-NEXT: stp q21, q22, [x0, #64]
+; VBITS_GE_128-NEXT: movprfx z0, z19
+; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z26.s
+; VBITS_GE_128-NEXT: mls z19.s, p0/m, z0.s, z26.s
+; VBITS_GE_128-NEXT: stp q19, q20, [x0, #32]
+; VBITS_GE_128-NEXT: movprfx z0, z17
+; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z28.s
+; VBITS_GE_128-NEXT: mls z17.s, p0/m, z0.s, z28.s
+; VBITS_GE_128-NEXT: stp q17, q18, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v64i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #48
+; VBITS_GE_256-NEXT: mov x9, #56
+; VBITS_GE_256-NEXT: mov x10, #32
+; VBITS_GE_256-NEXT: mov x11, #40
+; VBITS_GE_256-NEXT: mov x12, #16
+; VBITS_GE_256-NEXT: mov x13, #24
+; VBITS_GE_256-NEXT: mov x14, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x14, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x13, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x11, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x12, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x10, lsl #2]
+; VBITS_GE_256-NEXT: movprfx z23, z6
+; VBITS_GE_256-NEXT: sdiv z23.s, p0/m, z23.s, z16.s
+; VBITS_GE_256-NEXT: mls z6.s, p0/m, z23.s, z16.s
+; VBITS_GE_256-NEXT: movprfx z16, z5
+; VBITS_GE_256-NEXT: sdiv z16.s, p0/m, z16.s, z17.s
+; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mls z5.s, p0/m, z16.s, z17.s
+; VBITS_GE_256-NEXT: movprfx z16, z4
+; VBITS_GE_256-NEXT: sdiv z16.s, p0/m, z16.s, z21.s
+; VBITS_GE_256-NEXT: movprfx z17, z3
+; VBITS_GE_256-NEXT: sdiv z17.s, p0/m, z17.s, z20.s
+; VBITS_GE_256-NEXT: mls z4.s, p0/m, z16.s, z21.s
+; VBITS_GE_256-NEXT: mls z3.s, p0/m, z17.s, z20.s
+; VBITS_GE_256-NEXT: movprfx z16, z2
+; VBITS_GE_256-NEXT: sdiv z16.s, p0/m, z16.s, z22.s
+; VBITS_GE_256-NEXT: movprfx z17, z1
+; VBITS_GE_256-NEXT: sdiv z17.s, p0/m, z17.s, z19.s
+; VBITS_GE_256-NEXT: mls z2.s, p0/m, z16.s, z22.s
+; VBITS_GE_256-NEXT: mls z1.s, p0/m, z17.s, z19.s
+; VBITS_GE_256-NEXT: movprfx z16, z0
+; VBITS_GE_256-NEXT: sdiv z16.s, p0/m, z16.s, z18.s
+; VBITS_GE_256-NEXT: movprfx z17, z7
+; VBITS_GE_256-NEXT: sdiv z17.s, p0/m, z17.s, z23.s
+; VBITS_GE_256-NEXT: mls z0.s, p0/m, z16.s, z18.s
+; VBITS_GE_256-NEXT: mls z7.s, p0/m, z17.s, z23.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x11, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x14, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+  %op1 = load <64 x i32>, <64 x i32>* %a
+  %op2 = load <64 x i32>, <64 x i32>* %b
+  %res = srem <64 x i32> %op1, %op2
+  store <64 x i32> %res, <64 x i32>* %a
+  ret void
+}
+
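+; The i64 cases below follow the same direct sdiv + mls pattern using the
+; .d element forms.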
+define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+; CHECK-LABEL: srem_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+  %res = srem <1 x i64> %op1, %op2
+  ret <1 x i64> %res
+}
+
+define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+; CHECK-LABEL: srem_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+  %res = srem <2 x i64> %op1, %op2
+  ret <2 x i64> %res
+}
+
+define void @srem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: srem_v4i64:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x8, #2
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT: movprfx z4, z1
+; VBITS_GE_128-NEXT: sdiv z4.d, p0/m, z4.d, z3.d
+; VBITS_GE_128-NEXT: movprfx z5, z0
+; VBITS_GE_128-NEXT: sdiv z5.d, p0/m, z5.d, z2.d
+; VBITS_GE_128-NEXT: mls z1.d, p0/m, z4.d, z3.d
+; VBITS_GE_128-NEXT: mls z0.d, p0/m, z5.d, z2.d
+; VBITS_GE_128-NEXT: stp q1, q0, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v4i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: movprfx z2, z0
+; VBITS_GE_256-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
+; VBITS_GE_256-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: srem_v4i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl4
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: movprfx z2, z0
+; VBITS_GE_512-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+  %op1 = load <4 x i64>, <4 x i64>* %a
+  %op2 = load <4 x i64>, <4 x i64>* %b
+  %res = srem <4 x i64> %op1, %op2
+  store <4 x i64> %res, <4 x i64>* %a
+  ret void
+}
+
+define void @srem_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: srem_v8i64:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x8, #4
+; VBITS_GE_128-NEXT: mov x9, #6
+; VBITS_GE_128-NEXT: mov x10, #2
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT: movprfx z16, z1
+; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z4.d
+; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z4.d
+; VBITS_GE_128-NEXT: movprfx z4, z0
+; VBITS_GE_128-NEXT: sdiv z4.d, p0/m, z4.d, z5.d
+; VBITS_GE_128-NEXT: movprfx z16, z2
+; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z6.d
+; VBITS_GE_128-NEXT: mls z0.d, p0/m, z4.d, z5.d
+; VBITS_GE_128-NEXT: movprfx z4, z3
+; VBITS_GE_128-NEXT: sdiv z4.d, p0/m, z4.d, z7.d
+; VBITS_GE_128-NEXT: mls z3.d, p0/m, z4.d, z7.d
+; VBITS_GE_128-NEXT: mls z2.d, p0/m, z16.d, z6.d
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: stp q3, q2, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: movprfx z4, z0
+; VBITS_GE_256-NEXT: sdiv z4.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT: movprfx z5, z1
+; VBITS_GE_256-NEXT: sdiv z5.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT: mls z0.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: srem_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: movprfx z2, z0
+; VBITS_GE_512-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+  %op1 = load <8 x i64>, <8 x i64>* %a
+  %op2 = load <8 x i64>, <8 x i64>* %b
+  %res = srem <8 x i64> %op1, %op2
+  store <8 x i64> %res, <8 x i64>* %a
+  ret void
+}
+
+define void @srem_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: srem_v16i64:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x8, #12
+; VBITS_GE_128-NEXT: mov x9, #14
+; VBITS_GE_128-NEXT: mov x10, #8
+; VBITS_GE_128-NEXT: mov x11, #10
+; VBITS_GE_128-NEXT: mov x12, #4
+; VBITS_GE_128-NEXT: mov x13, #6
+; VBITS_GE_128-NEXT: mov x14, #2
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z18.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z17.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z19.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z21.d }, p0/z, [x1, x11, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z20.d }, p0/z, [x1, x14, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z22.d }, p0/z, [x1, x12, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z23.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT: movprfx z24, z5
+; VBITS_GE_128-NEXT: sdiv z24.d, p0/m, z24.d, z16.d
+; VBITS_GE_128-NEXT: mls z5.d, p0/m, z24.d, z16.d
+; VBITS_GE_128-NEXT: movprfx z16, z1
+; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z18.d
+; VBITS_GE_128-NEXT: movprfx z24, z6
+; VBITS_GE_128-NEXT: sdiv z24.d, p0/m, z24.d, z20.d
+; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z18.d
+; VBITS_GE_128-NEXT: movprfx z16, z4
+; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z22.d
+; VBITS_GE_128-NEXT: movprfx z18, z0
+; VBITS_GE_128-NEXT: sdiv z18.d, p0/m, z18.d, z17.d
+; VBITS_GE_128-NEXT: mls z0.d, p0/m, z18.d, z17.d
+; VBITS_GE_128-NEXT: movprfx z17, z3
+; VBITS_GE_128-NEXT: sdiv z17.d, p0/m, z17.d, z21.d
+; VBITS_GE_128-NEXT: movprfx z18, z2
+; VBITS_GE_128-NEXT: sdiv z18.d, p0/m, z18.d, z19.d
+; VBITS_GE_128-NEXT: mls z2.d, p0/m, z18.d, z19.d
+; VBITS_GE_128-NEXT: mls z3.d, p0/m, z17.d, z21.d
+; VBITS_GE_128-NEXT: stp q2, q3, [x0, #64]
+; VBITS_GE_128-NEXT: mls z4.d, p0/m, z16.d, z22.d
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #96]
+; VBITS_GE_128-NEXT: movprfx z0, z7
+; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z23.d
+; VBITS_GE_128-NEXT: mls z7.d, p0/m, z0.d, z23.d
+; VBITS_GE_128-NEXT: mls z6.d, p0/m, z24.d, z20.d
+; VBITS_GE_128-NEXT: stp q7, q6, [x0]
+; VBITS_GE_128-NEXT: stp q4, q5, [x0, #32]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v16i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x9, #12
+; VBITS_GE_256-NEXT: mov x10, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: movprfx z16, z2
+; VBITS_GE_256-NEXT: sdiv z16.d, p0/m, z16.d, z5.d
+; VBITS_GE_256-NEXT: movprfx z17, z1
+; VBITS_GE_256-NEXT: sdiv z17.d, p0/m, z17.d, z4.d
+; VBITS_GE_256-NEXT: mls z2.d, p0/m, z16.d, z5.d
+; VBITS_GE_256-NEXT: mls z1.d, p0/m, z17.d, z4.d
+; VBITS_GE_256-NEXT: movprfx z4, z0
+; VBITS_GE_256-NEXT: sdiv z4.d, p0/m, z4.d, z6.d
+; VBITS_GE_256-NEXT: movprfx z5, z3
+; VBITS_GE_256-NEXT: sdiv z5.d, p0/m, z5.d, z7.d
+; VBITS_GE_256-NEXT: mls z0.d, p0/m, z4.d, z6.d
+; VBITS_GE_256-NEXT: mls z3.d, p0/m, z5.d, z7.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+  %op1 = load <16 x i64>, <16 x i64>* %a
+  %op2 = load <16 x i64>, <16 x i64>* %b
+  %res = srem <16 x i64> %op1, %op2
+  store <16 x i64> %res, <16 x i64>* %a
+  ret void
+}
+
+define void @srem_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: srem_v32i64:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x15, #30
+; VBITS_GE_128-NEXT: mov x16, #28
+; VBITS_GE_128-NEXT: mov x13, #26
+; VBITS_GE_128-NEXT: mov x14, #24
+; VBITS_GE_128-NEXT: mov x17, #22
+; VBITS_GE_128-NEXT: mov x11, #20
+; VBITS_GE_128-NEXT: mov x18, #18
+; VBITS_GE_128-NEXT: mov x12, #16
+; VBITS_GE_128-NEXT: mov x2, #14
+; VBITS_GE_128-NEXT: mov x8, #12
+; VBITS_GE_128-NEXT: mov x3, #10
+; VBITS_GE_128-NEXT: mov x9, #8
+; VBITS_GE_128-NEXT: mov x4, #6
+; VBITS_GE_128-NEXT: mov x10, #4
+; VBITS_GE_128-NEXT: mov x5, #2
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x15, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x16, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x13, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z16.d }, p0/z, [x0, x14, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x0, x17, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x0, x11, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x0, x18, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0, x2, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z23.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z22.d }, p0/z, [x0, x3, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z21.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z20.d }, p0/z, [x0, x4, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z19.d }, p0/z, [x0, x10, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z18.d }, p0/z, [x0, x5, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z17.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1d { z24.d }, p0/z, [x1, x5, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z25.d }, p0/z, [x1, x4, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z28.d }, p0/z, [x1, x3, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z26.d }, p0/z, [x1, x15, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z27.d }, p0/z, [x1, x16, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z29.d }, p0/z, [x1, x2, lsl #3]
+; VBITS_GE_128-NEXT: movprfx z30, z18
+; VBITS_GE_128-NEXT: sdiv z30.d, p0/m, z30.d, z24.d
+; VBITS_GE_128-NEXT: mls z18.d, p0/m, z30.d, z24.d
+; VBITS_GE_128-NEXT: movprfx z24, z20
+; VBITS_GE_128-NEXT: sdiv z24.d, p0/m, z24.d, z25.d
+; VBITS_GE_128-NEXT: ld1d { z30.d }, p0/z, [x1, x18, lsl #3]
+; VBITS_GE_128-NEXT: mls z20.d, p0/m, z24.d, z25.d
+; VBITS_GE_128-NEXT: ld1d { z24.d }, p0/z, [x1, x17, lsl #3]
+; VBITS_GE_128-NEXT: movprfx z25, z22
+; VBITS_GE_128-NEXT: sdiv z25.d, p0/m, z25.d, z28.d
+; VBITS_GE_128-NEXT: mls z22.d, p0/m, z25.d, z28.d
+; VBITS_GE_128-NEXT: movprfx z25, z3
+; VBITS_GE_128-NEXT: sdiv z25.d, p0/m, z25.d, z29.d
+; VBITS_GE_128-NEXT: ld1d { z28.d }, p0/z, [x1, x13, lsl #3]
+; VBITS_GE_128-NEXT: mls z3.d, p0/m, z25.d, z29.d
+; VBITS_GE_128-NEXT: ld1d { z25.d }, p0/z, [x1, x14, lsl #3]
+; VBITS_GE_128-NEXT: movprfx z29, z5
+; VBITS_GE_128-NEXT: sdiv z29.d, p0/m, z29.d, z30.d
+; VBITS_GE_128-NEXT: mls z5.d, p0/m, z29.d, z30.d
+; VBITS_GE_128-NEXT: movprfx z29, z7
+; VBITS_GE_128-NEXT: sdiv z29.d, p0/m, z29.d, z24.d
+; VBITS_GE_128-NEXT: ld1d { z30.d }, p0/z, [x1, x11, lsl #3]
+; VBITS_GE_128-NEXT: mls z7.d, p0/m, z29.d, z24.d
+; VBITS_GE_128-NEXT: ld1d { z24.d }, p0/z, [x1, x12, lsl #3]
+; VBITS_GE_128-NEXT: movprfx z29, z0
+; VBITS_GE_128-NEXT: sdiv z29.d, p0/m, z29.d, z28.d
+; VBITS_GE_128-NEXT: mls z0.d, p0/m, z29.d, z28.d
+; VBITS_GE_128-NEXT: movprfx z28, z1
+; VBITS_GE_128-NEXT: sdiv z28.d, p0/m, z28.d, z27.d
+; VBITS_GE_128-NEXT: ld1d { z29.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT: mls z1.d, p0/m, z28.d, z27.d
+; VBITS_GE_128-NEXT: ld1d { z27.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_128-NEXT: movprfx z28, z2
+; VBITS_GE_128-NEXT: sdiv z28.d, p0/m, z28.d, z26.d
+; VBITS_GE_128-NEXT: mls z2.d, p0/m, z28.d, z26.d
+; VBITS_GE_128-NEXT: ld1d { z26.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z28.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT: stp q1, q2, [x0, #224]
+; VBITS_GE_128-NEXT: movprfx z1, z16
+; VBITS_GE_128-NEXT: sdiv z1.d, p0/m, z1.d, z25.d
+; VBITS_GE_128-NEXT: mls z16.d, p0/m, z1.d, z25.d
+; VBITS_GE_128-NEXT: stp q16, q0, [x0, #192]
+; VBITS_GE_128-NEXT: movprfx z0, z6
+; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z30.d
+; VBITS_GE_128-NEXT: mls z6.d, p0/m, z0.d, z30.d
+; VBITS_GE_128-NEXT: stp q6, q7, [x0, #160]
+; VBITS_GE_128-NEXT: movprfx z0, z4
+; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z24.d
+; VBITS_GE_128-NEXT: mls z4.d, p0/m, z0.d, z24.d
+; VBITS_GE_128-NEXT: stp q4, q5, [x0, #128]
+; VBITS_GE_128-NEXT: movprfx z0, z23
+; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z29.d
+; VBITS_GE_128-NEXT: mls z23.d, p0/m, z0.d, z29.d
+; VBITS_GE_128-NEXT: stp q23, q3, [x0, #96]
+; VBITS_GE_128-NEXT: movprfx z0, z21
+; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z27.d
+; VBITS_GE_128-NEXT: mls z21.d, p0/m, z0.d, z27.d
+; VBITS_GE_128-NEXT: stp q21, q22, [x0, #64]
+; VBITS_GE_128-NEXT: movprfx z0, z19
+; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z26.d
+; VBITS_GE_128-NEXT: mls z19.d, p0/m, z0.d, z26.d
+; VBITS_GE_128-NEXT: stp q19, q20, [x0, #32]
+; VBITS_GE_128-NEXT: movprfx z0, z17
+; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z28.d
+; VBITS_GE_128-NEXT: mls z17.d, p0/m, z0.d, z28.d
+; VBITS_GE_128-NEXT: stp q17, q18, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: srem_v32i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #24
+; VBITS_GE_256-NEXT: mov x9, #28
+; VBITS_GE_256-NEXT: mov x10, #16
+; VBITS_GE_256-NEXT: mov x11, #20
+; VBITS_GE_256-NEXT: mov x12, #8
+; VBITS_GE_256-NEXT: mov x13, #12
+; VBITS_GE_256-NEXT: mov x14, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x14, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x13, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x12, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_256-NEXT: movprfx z23, z6
+; VBITS_GE_256-NEXT: sdiv z23.d, p0/m, z23.d, z16.d
+; VBITS_GE_256-NEXT: mls z6.d, p0/m, z23.d, z16.d
+; VBITS_GE_256-NEXT: movprfx z16, z5
+; VBITS_GE_256-NEXT: sdiv z16.d, p0/m, z16.d, z17.d
+; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mls z5.d, p0/m, z16.d, z17.d
+; VBITS_GE_256-NEXT: movprfx z16, z4
+; VBITS_GE_256-NEXT: sdiv z16.d, p0/m, z16.d, z21.d
+; VBITS_GE_256-NEXT: movprfx z17, z3
+; VBITS_GE_256-NEXT: sdiv z17.d, p0/m, z17.d, z20.d
+; VBITS_GE_256-NEXT: mls z4.d, p0/m, z16.d, z21.d
+; VBITS_GE_256-NEXT: mls z3.d, p0/m, z17.d, z20.d
+; VBITS_GE_256-NEXT: movprfx z16, z2
+; VBITS_GE_256-NEXT: sdiv z16.d, p0/m, z16.d, z22.d
+; VBITS_GE_256-NEXT: movprfx z17, z1
+; VBITS_GE_256-NEXT: sdiv z17.d, p0/m, z17.d, z19.d
+; VBITS_GE_256-NEXT: mls z2.d, p0/m, z16.d, z22.d
+; VBITS_GE_256-NEXT: mls z1.d, p0/m, z17.d, z19.d
+; VBITS_GE_256-NEXT: movprfx z16, z0
+; VBITS_GE_256-NEXT: sdiv z16.d, p0/m, z16.d, z18.d
+; VBITS_GE_256-NEXT: movprfx z17, z7
+; VBITS_GE_256-NEXT: sdiv z17.d, p0/m, z17.d, z23.d
+; VBITS_GE_256-NEXT: mls z0.d, p0/m, z16.d, z18.d
+; VBITS_GE_256-NEXT: mls z7.d, p0/m, z17.d, z23.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+  %op1 = load <32 x i64>, <32 x i64>* %a
+  %op2 = load <32 x i64>, <32 x i64>* %b
+  %res = srem <32 x i64> %op1, %op2
+  store <32 x i64> %res, <32 x i64>* %a
+  ret void
+}
+
+;
+; UREM
+;
+
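+; The unsigned tests mirror the signed ones, widening with uunpklo/uunpkhi
+; and dividing with udiv. For v8i8 the narrowed quotient is reassembled
+; byte-by-byte through the stack (strb + ld1b) before the final mls.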
z16.d, p0/m, z16.d, z21.d +; VBITS_GE_256-NEXT: movprfx z17, z3 +; VBITS_GE_256-NEXT: sdiv z17.d, p0/m, z17.d, z20.d +; VBITS_GE_256-NEXT: mls z4.d, p0/m, z16.d, z21.d +; VBITS_GE_256-NEXT: mls z3.d, p0/m, z17.d, z20.d +; VBITS_GE_256-NEXT: movprfx z16, z2 +; VBITS_GE_256-NEXT: sdiv z16.d, p0/m, z16.d, z22.d +; VBITS_GE_256-NEXT: movprfx z17, z1 +; VBITS_GE_256-NEXT: sdiv z17.d, p0/m, z17.d, z19.d +; VBITS_GE_256-NEXT: mls z2.d, p0/m, z16.d, z22.d +; VBITS_GE_256-NEXT: mls z1.d, p0/m, z17.d, z19.d +; VBITS_GE_256-NEXT: movprfx z16, z0 +; VBITS_GE_256-NEXT: sdiv z16.d, p0/m, z16.d, z18.d +; VBITS_GE_256-NEXT: movprfx z17, z7 +; VBITS_GE_256-NEXT: sdiv z17.d, p0/m, z17.d, z23.d +; VBITS_GE_256-NEXT: mls z0.d, p0/m, z16.d, z18.d +; VBITS_GE_256-NEXT: mls z7.d, p0/m, z17.d, z23.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <32 x i64>, <32 x i64>* %a + %op2 = load <32 x i64>, <32 x i64>* %b + %res = srem <32 x i64> %op1, %op2 + store <32 x i64> %res, <32 x i64>* %a + ret void +} + +; +; UREM +; + +define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; VBITS_GE_128-LABEL: urem_v8i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_128-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_128-NEXT: uunpklo z2.h, z1.b +; VBITS_GE_128-NEXT: uunpklo z3.h, z0.b +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: uunpkhi z4.s, z2.h +; VBITS_GE_128-NEXT: uunpkhi z5.s, z3.h +; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_128-NEXT: ptrue p0.b, vl8 +; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z4.h +; VBITS_GE_128-NEXT: fmov w8, s2 +; VBITS_GE_128-NEXT: mov z3.h, z2.h[7] +; VBITS_GE_128-NEXT: mov z5.h, z2.h[5] +; VBITS_GE_128-NEXT: fmov w9, s3 +; VBITS_GE_128-NEXT: mov z4.h, z2.h[6] +; VBITS_GE_128-NEXT: mov z6.h, z2.h[4] +; VBITS_GE_128-NEXT: strb w8, [sp, #8] +; VBITS_GE_128-NEXT: fmov w8, s5 +; VBITS_GE_128-NEXT: mov z16.h, z2.h[2] +; VBITS_GE_128-NEXT: fmov w10, s4 +; VBITS_GE_128-NEXT: strb w9, [sp, #15] +; VBITS_GE_128-NEXT: fmov w9, s6 +; VBITS_GE_128-NEXT: strb w8, [sp, #13] +; VBITS_GE_128-NEXT: fmov w8, s16 +; VBITS_GE_128-NEXT: mov z7.h, z2.h[3] +; VBITS_GE_128-NEXT: mov z2.h, z2.h[1] +; VBITS_GE_128-NEXT: strb w10, [sp, #14] +; VBITS_GE_128-NEXT: fmov w10, s7 +; VBITS_GE_128-NEXT: strb w9, [sp, #12] +; VBITS_GE_128-NEXT: fmov w9, s2 +; VBITS_GE_128-NEXT: strb w8, [sp, #10] +; VBITS_GE_128-NEXT: add x8, sp, #8 +; VBITS_GE_128-NEXT: strb w10, [sp, #11] +; VBITS_GE_128-NEXT: strb w9, [sp, #9] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x8] +; VBITS_GE_128-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_128-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v8i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: sub sp, sp, #16 +; VBITS_GE_256-NEXT: .cfi_def_cfa_offset 16 +; 
VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: uunpklo z2.h, z1.b +; VBITS_GE_256-NEXT: uunpklo z3.h, z0.b +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.b, vl8 +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: fmov w8, s2 +; VBITS_GE_256-NEXT: mov z3.h, z2.h[7] +; VBITS_GE_256-NEXT: mov z5.h, z2.h[5] +; VBITS_GE_256-NEXT: fmov w9, s3 +; VBITS_GE_256-NEXT: mov z4.h, z2.h[6] +; VBITS_GE_256-NEXT: mov z6.h, z2.h[4] +; VBITS_GE_256-NEXT: strb w8, [sp, #8] +; VBITS_GE_256-NEXT: fmov w8, s5 +; VBITS_GE_256-NEXT: mov z16.h, z2.h[2] +; VBITS_GE_256-NEXT: fmov w10, s4 +; VBITS_GE_256-NEXT: strb w9, [sp, #15] +; VBITS_GE_256-NEXT: fmov w9, s6 +; VBITS_GE_256-NEXT: strb w8, [sp, #13] +; VBITS_GE_256-NEXT: fmov w8, s16 +; VBITS_GE_256-NEXT: mov z7.h, z2.h[3] +; VBITS_GE_256-NEXT: mov z2.h, z2.h[1] +; VBITS_GE_256-NEXT: strb w10, [sp, #14] +; VBITS_GE_256-NEXT: fmov w10, s7 +; VBITS_GE_256-NEXT: strb w9, [sp, #12] +; VBITS_GE_256-NEXT: fmov w9, s2 +; VBITS_GE_256-NEXT: strb w8, [sp, #10] +; VBITS_GE_256-NEXT: add x8, sp, #8 +; VBITS_GE_256-NEXT: strb w10, [sp, #11] +; VBITS_GE_256-NEXT: strb w9, [sp, #9] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x8] +; VBITS_GE_256-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_256-NEXT: add sp, sp, #16 +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: urem_v8i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: sub sp, sp, #16 +; VBITS_GE_512-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_512-NEXT: uunpklo z2.h, z1.b +; VBITS_GE_512-NEXT: uunpklo z3.h, z0.b +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_512-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_512-NEXT: ptrue p0.b, vl8 +; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_512-NEXT: fmov w8, s2 +; VBITS_GE_512-NEXT: mov z3.h, z2.h[7] +; VBITS_GE_512-NEXT: mov z5.h, z2.h[5] +; VBITS_GE_512-NEXT: fmov w9, s3 +; VBITS_GE_512-NEXT: mov z4.h, z2.h[6] +; VBITS_GE_512-NEXT: mov z6.h, z2.h[4] +; VBITS_GE_512-NEXT: strb w8, [sp, #8] +; VBITS_GE_512-NEXT: fmov w8, s5 +; VBITS_GE_512-NEXT: mov z16.h, z2.h[2] +; VBITS_GE_512-NEXT: fmov w10, s4 +; VBITS_GE_512-NEXT: strb w9, [sp, #15] +; VBITS_GE_512-NEXT: fmov w9, s6 +; VBITS_GE_512-NEXT: strb w8, [sp, #13] +; VBITS_GE_512-NEXT: fmov w8, s16 +; VBITS_GE_512-NEXT: mov z7.h, z2.h[3] +; VBITS_GE_512-NEXT: mov z2.h, z2.h[1] +; VBITS_GE_512-NEXT: strb w10, [sp, #14] +; VBITS_GE_512-NEXT: fmov w10, s7 +; VBITS_GE_512-NEXT: strb w9, [sp, #12] +; VBITS_GE_512-NEXT: fmov w9, s2 +; VBITS_GE_512-NEXT: strb w8, [sp, #10] +; VBITS_GE_512-NEXT: add x8, sp, #8 +; VBITS_GE_512-NEXT: strb w10, [sp, #11] +; VBITS_GE_512-NEXT: strb w9, [sp, #9] +; VBITS_GE_512-NEXT: ld1b { z2.b }, p0/z, [x8] +; VBITS_GE_512-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_512-NEXT: add sp, sp, #16 +; VBITS_GE_512-NEXT: ret + %res = urem <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; VBITS_GE_128-LABEL: urem_v16i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: // kill: def 
$q1 killed $q1 def $z1 +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_128-NEXT: uunpkhi z2.h, z1.b +; VBITS_GE_128-NEXT: uunpkhi z3.h, z0.b +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: uunpkhi z5.s, z2.h +; VBITS_GE_128-NEXT: uunpkhi z6.s, z3.h +; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: uunpklo z4.h, z1.b +; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_128-NEXT: uunpklo z3.h, z0.b +; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s +; VBITS_GE_128-NEXT: uunpkhi z6.s, z4.h +; VBITS_GE_128-NEXT: uunpkhi z7.s, z3.h +; VBITS_GE_128-NEXT: uunpklo z4.s, z4.h +; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; VBITS_GE_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s +; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z5.h +; VBITS_GE_128-NEXT: uzp1 z3.h, z3.h, z6.h +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: uzp1 z2.b, z3.b, z2.b +; VBITS_GE_128-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v16i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: uunpklo z2.h, z1.b +; VBITS_GE_256-NEXT: uunpklo z3.h, z0.b +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: uunpkhi z4.s, z2.h +; VBITS_GE_256-NEXT: uunpkhi z5.s, z3.h +; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: urem_v16i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_512-NEXT: uunpklo z2.h, z1.b +; VBITS_GE_512-NEXT: uunpklo z3.h, z0.b +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_512-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_512-NEXT: ptrue p0.b, vl16 +; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_512-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_512-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_512-NEXT: ret + %res = urem <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: urem_v32i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ptrue p1.s, vl4 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: uunpkhi z5.h, z0.b +; VBITS_GE_128-NEXT: uunpklo z7.h, z0.b +; VBITS_GE_128-NEXT: uunpkhi z4.h, z2.b +; VBITS_GE_128-NEXT: uunpklo z6.h, z2.b +; VBITS_GE_128-NEXT: uunpkhi z16.s, z4.h +; VBITS_GE_128-NEXT: uunpkhi z17.s, z5.h +; VBITS_GE_128-NEXT: uunpklo z4.s, z4.h +; VBITS_GE_128-NEXT: uunpklo z5.s, z5.h +; VBITS_GE_128-NEXT: uunpkhi z18.s, z6.h +; VBITS_GE_128-NEXT: udivr z16.s, 
p1/m, z16.s, z17.s +; VBITS_GE_128-NEXT: udivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_128-NEXT: uunpkhi z5.s, z7.h +; VBITS_GE_128-NEXT: uunpklo z6.s, z6.h +; VBITS_GE_128-NEXT: uunpklo z7.s, z7.h +; VBITS_GE_128-NEXT: uzp1 z4.h, z4.h, z16.h +; VBITS_GE_128-NEXT: udivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_128-NEXT: uunpkhi z7.h, z3.b +; VBITS_GE_128-NEXT: uunpkhi z16.h, z1.b +; VBITS_GE_128-NEXT: udiv z5.s, p1/m, z5.s, z18.s +; VBITS_GE_128-NEXT: uunpkhi z17.s, z7.h +; VBITS_GE_128-NEXT: uunpkhi z18.s, z16.h +; VBITS_GE_128-NEXT: uunpklo z7.s, z7.h +; VBITS_GE_128-NEXT: uunpklo z16.s, z16.h +; VBITS_GE_128-NEXT: udivr z17.s, p1/m, z17.s, z18.s +; VBITS_GE_128-NEXT: udivr z7.s, p1/m, z7.s, z16.s +; VBITS_GE_128-NEXT: uunpklo z16.h, z3.b +; VBITS_GE_128-NEXT: uunpklo z18.h, z1.b +; VBITS_GE_128-NEXT: uunpkhi z19.s, z16.h +; VBITS_GE_128-NEXT: uunpkhi z20.s, z18.h +; VBITS_GE_128-NEXT: uunpklo z16.s, z16.h +; VBITS_GE_128-NEXT: uunpklo z18.s, z18.h +; VBITS_GE_128-NEXT: udivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_128-NEXT: udivr z16.s, p1/m, z16.s, z18.s +; VBITS_GE_128-NEXT: uzp1 z7.h, z7.h, z17.h +; VBITS_GE_128-NEXT: uzp1 z16.h, z16.h, z19.h +; VBITS_GE_128-NEXT: uzp1 z5.h, z6.h, z5.h +; VBITS_GE_128-NEXT: uzp1 z6.b, z16.b, z7.b +; VBITS_GE_128-NEXT: uzp1 z4.b, z5.b, z4.b +; VBITS_GE_128-NEXT: mls z1.b, p0/m, z6.b, z3.b +; VBITS_GE_128-NEXT: mls z0.b, p0/m, z4.b, z2.b +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: uunpkhi z2.h, z1.b +; VBITS_GE_256-NEXT: uunpkhi z3.h, z0.b +; VBITS_GE_256-NEXT: uunpklo z4.h, z1.b +; VBITS_GE_256-NEXT: uunpklo z5.h, z0.b +; VBITS_GE_256-NEXT: uunpkhi z6.s, z2.h +; VBITS_GE_256-NEXT: uunpkhi z7.s, z3.h +; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: udivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_256-NEXT: uunpkhi z7.s, z4.h +; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s +; VBITS_GE_256-NEXT: uunpkhi z3.s, z5.h +; VBITS_GE_256-NEXT: uunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: uunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: udiv z3.s, p1/m, z3.s, z7.s +; VBITS_GE_256-NEXT: udivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z6.h +; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z2.b, z3.b, z2.b +; VBITS_GE_256-NEXT: mls z0.b, p0/m, z2.b, z1.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = urem <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @urem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: urem_v64i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w8, #32 +; VBITS_GE_128-NEXT: mov w9, #48 +; VBITS_GE_128-NEXT: mov w10, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ptrue p1.s, vl4 +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: uunpkhi z16.h, z0.b +; VBITS_GE_128-NEXT: uunpkhi z7.h, z4.b +; 
VBITS_GE_128-NEXT: uunpkhi z18.s, z16.h +; VBITS_GE_128-NEXT: uunpkhi z17.s, z7.h +; VBITS_GE_128-NEXT: uunpklo z7.s, z7.h +; VBITS_GE_128-NEXT: uunpklo z16.s, z16.h +; VBITS_GE_128-NEXT: udivr z17.s, p1/m, z17.s, z18.s +; VBITS_GE_128-NEXT: udivr z7.s, p1/m, z7.s, z16.s +; VBITS_GE_128-NEXT: uunpklo z16.h, z4.b +; VBITS_GE_128-NEXT: uunpklo z18.h, z0.b +; VBITS_GE_128-NEXT: uunpkhi z19.s, z16.h +; VBITS_GE_128-NEXT: uunpkhi z20.s, z18.h +; VBITS_GE_128-NEXT: uunpklo z16.s, z16.h +; VBITS_GE_128-NEXT: uunpklo z18.s, z18.h +; VBITS_GE_128-NEXT: udivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_128-NEXT: udivr z16.s, p1/m, z16.s, z18.s +; VBITS_GE_128-NEXT: uzp1 z7.h, z7.h, z17.h +; VBITS_GE_128-NEXT: uunpkhi z17.h, z6.b +; VBITS_GE_128-NEXT: uunpkhi z18.h, z1.b +; VBITS_GE_128-NEXT: uzp1 z16.h, z16.h, z19.h +; VBITS_GE_128-NEXT: uunpkhi z19.s, z17.h +; VBITS_GE_128-NEXT: uunpkhi z20.s, z18.h +; VBITS_GE_128-NEXT: uunpklo z17.s, z17.h +; VBITS_GE_128-NEXT: uunpklo z18.s, z18.h +; VBITS_GE_128-NEXT: udivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_128-NEXT: udivr z17.s, p1/m, z17.s, z18.s +; VBITS_GE_128-NEXT: uzp1 z7.b, z16.b, z7.b +; VBITS_GE_128-NEXT: uzp1 z16.h, z17.h, z19.h +; VBITS_GE_128-NEXT: uunpklo z17.h, z6.b +; VBITS_GE_128-NEXT: uunpklo z18.h, z1.b +; VBITS_GE_128-NEXT: uunpkhi z19.s, z17.h +; VBITS_GE_128-NEXT: uunpkhi z20.s, z18.h +; VBITS_GE_128-NEXT: uunpklo z17.s, z17.h +; VBITS_GE_128-NEXT: uunpklo z18.s, z18.h +; VBITS_GE_128-NEXT: udivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_128-NEXT: udivr z17.s, p1/m, z17.s, z18.s +; VBITS_GE_128-NEXT: uunpkhi z18.h, z5.b +; VBITS_GE_128-NEXT: uunpkhi z20.h, z2.b +; VBITS_GE_128-NEXT: uunpkhi z21.s, z18.h +; VBITS_GE_128-NEXT: uunpkhi z22.s, z20.h +; VBITS_GE_128-NEXT: uzp1 z17.h, z17.h, z19.h +; VBITS_GE_128-NEXT: movprfx z19, z22 +; VBITS_GE_128-NEXT: udiv z19.s, p1/m, z19.s, z21.s +; VBITS_GE_128-NEXT: uunpklo z21.h, z5.b +; VBITS_GE_128-NEXT: uunpklo z22.h, z2.b +; VBITS_GE_128-NEXT: uunpklo z18.s, z18.h +; VBITS_GE_128-NEXT: uunpklo z20.s, z20.h +; VBITS_GE_128-NEXT: uunpkhi z23.s, z21.h +; VBITS_GE_128-NEXT: uunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: uunpklo z21.s, z21.h +; VBITS_GE_128-NEXT: uunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: udivr z18.s, p1/m, z18.s, z20.s +; VBITS_GE_128-NEXT: movprfx z20, z24 +; VBITS_GE_128-NEXT: udiv z20.s, p1/m, z20.s, z23.s +; VBITS_GE_128-NEXT: udivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_128-NEXT: ld1b { z22.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: uzp1 z18.h, z18.h, z19.h +; VBITS_GE_128-NEXT: uzp1 z19.h, z21.h, z20.h +; VBITS_GE_128-NEXT: uzp1 z16.b, z17.b, z16.b +; VBITS_GE_128-NEXT: uzp1 z17.b, z19.b, z18.b +; VBITS_GE_128-NEXT: mls z1.b, p0/m, z16.b, z6.b +; VBITS_GE_128-NEXT: mls z2.b, p0/m, z17.b, z5.b +; VBITS_GE_128-NEXT: uunpkhi z6.h, z3.b +; VBITS_GE_128-NEXT: uunpkhi z5.h, z22.b +; VBITS_GE_128-NEXT: uunpkhi z17.s, z6.h +; VBITS_GE_128-NEXT: uunpkhi z16.s, z5.h +; VBITS_GE_128-NEXT: uunpklo z5.s, z5.h +; VBITS_GE_128-NEXT: uunpklo z6.s, z6.h +; VBITS_GE_128-NEXT: udivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_128-NEXT: udivr z5.s, p1/m, z5.s, z6.s +; VBITS_GE_128-NEXT: uunpklo z6.h, z22.b +; VBITS_GE_128-NEXT: uunpklo z17.h, z3.b +; VBITS_GE_128-NEXT: uunpkhi z18.s, z6.h +; VBITS_GE_128-NEXT: uunpkhi z19.s, z17.h +; VBITS_GE_128-NEXT: uunpklo z6.s, z6.h +; VBITS_GE_128-NEXT: uunpklo z17.s, z17.h +; VBITS_GE_128-NEXT: udivr z18.s, p1/m, z18.s, z19.s +; VBITS_GE_128-NEXT: udivr z6.s, p1/m, z6.s, z17.s +; VBITS_GE_128-NEXT: uzp1 z5.h, z5.h, z16.h +; VBITS_GE_128-NEXT: uzp1 z6.h, z6.h, z18.h +; 
VBITS_GE_128-NEXT: mls z0.b, p0/m, z7.b, z4.b +; VBITS_GE_128-NEXT: uzp1 z5.b, z6.b, z5.b +; VBITS_GE_128-NEXT: stp q2, q1, [x0, #32] +; VBITS_GE_128-NEXT: mls z3.b, p0/m, z5.b, z22.b +; VBITS_GE_128-NEXT: stp q3, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: uunpkhi z5.h, z0.b +; VBITS_GE_256-NEXT: uunpklo z7.h, z0.b +; VBITS_GE_256-NEXT: uunpkhi z4.h, z2.b +; VBITS_GE_256-NEXT: uunpklo z6.h, z2.b +; VBITS_GE_256-NEXT: uunpkhi z16.s, z4.h +; VBITS_GE_256-NEXT: uunpkhi z17.s, z5.h +; VBITS_GE_256-NEXT: uunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: uunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: uunpkhi z18.s, z6.h +; VBITS_GE_256-NEXT: udivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_256-NEXT: uunpkhi z5.s, z7.h +; VBITS_GE_256-NEXT: uunpklo z6.s, z6.h +; VBITS_GE_256-NEXT: uunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: udiv z5.s, p1/m, z5.s, z18.s +; VBITS_GE_256-NEXT: udivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_256-NEXT: udivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_256-NEXT: uzp1 z5.h, z6.h, z5.h +; VBITS_GE_256-NEXT: uunpkhi z6.h, z3.b +; VBITS_GE_256-NEXT: uunpkhi z7.h, z1.b +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z16.h +; VBITS_GE_256-NEXT: uunpkhi z16.s, z6.h +; VBITS_GE_256-NEXT: uunpkhi z17.s, z7.h +; VBITS_GE_256-NEXT: uunpklo z6.s, z6.h +; VBITS_GE_256-NEXT: uunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: udivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_256-NEXT: udivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_256-NEXT: uunpklo z7.h, z3.b +; VBITS_GE_256-NEXT: uunpklo z17.h, z1.b +; VBITS_GE_256-NEXT: uunpkhi z18.s, z7.h +; VBITS_GE_256-NEXT: uunpkhi z19.s, z17.h +; VBITS_GE_256-NEXT: uunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: uunpklo z17.s, z17.h +; VBITS_GE_256-NEXT: udivr z18.s, p1/m, z18.s, z19.s +; VBITS_GE_256-NEXT: udivr z7.s, p1/m, z7.s, z17.s +; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z16.h +; VBITS_GE_256-NEXT: uzp1 z7.h, z7.h, z18.h +; VBITS_GE_256-NEXT: uzp1 z4.b, z5.b, z4.b +; VBITS_GE_256-NEXT: uzp1 z5.b, z7.b, z6.b +; VBITS_GE_256-NEXT: mls z0.b, p0/m, z4.b, z2.b +; VBITS_GE_256-NEXT: mls z1.b, p0/m, z5.b, z3.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = urem <64 x i8> %op1, %op2 + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define void @urem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: urem_v128i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov w11, #96 +; VBITS_GE_128-NEXT: mov w8, #112 +; VBITS_GE_128-NEXT: mov w9, #64 +; VBITS_GE_128-NEXT: mov w10, #80 +; VBITS_GE_128-NEXT: mov w12, #32 +; VBITS_GE_128-NEXT: mov w13, #48 +; VBITS_GE_128-NEXT: mov w14, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x11] +; VBITS_GE_128-NEXT: ld1b { z7.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0, x12] +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x13] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x14] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z16.b 
}, p0/z, [x1, x14] +; VBITS_GE_128-NEXT: ptrue p1.s, vl4 +; VBITS_GE_128-NEXT: ld1b { z19.b }, p0/z, [x1, x13] +; VBITS_GE_128-NEXT: ld1b { z18.b }, p0/z, [x1, x11] +; VBITS_GE_128-NEXT: uunpkhi z21.h, z1.b +; VBITS_GE_128-NEXT: uunpkhi z17.h, z16.b +; VBITS_GE_128-NEXT: uunpkhi z22.s, z21.h +; VBITS_GE_128-NEXT: uunpkhi z20.s, z17.h +; VBITS_GE_128-NEXT: uunpklo z17.s, z17.h +; VBITS_GE_128-NEXT: uunpklo z21.s, z21.h +; VBITS_GE_128-NEXT: udivr z20.s, p1/m, z20.s, z22.s +; VBITS_GE_128-NEXT: udivr z17.s, p1/m, z17.s, z21.s +; VBITS_GE_128-NEXT: uunpklo z21.h, z16.b +; VBITS_GE_128-NEXT: uunpklo z22.h, z1.b +; VBITS_GE_128-NEXT: uunpkhi z23.s, z21.h +; VBITS_GE_128-NEXT: uunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: uunpklo z21.s, z21.h +; VBITS_GE_128-NEXT: uunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: udivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_128-NEXT: udivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_128-NEXT: uzp1 z17.h, z17.h, z20.h +; VBITS_GE_128-NEXT: uzp1 z20.h, z21.h, z23.h +; VBITS_GE_128-NEXT: ld1b { z21.b }, p0/z, [x1, x12] +; VBITS_GE_128-NEXT: uzp1 z17.b, z20.b, z17.b +; VBITS_GE_128-NEXT: uunpkhi z20.h, z19.b +; VBITS_GE_128-NEXT: uunpkhi z22.h, z2.b +; VBITS_GE_128-NEXT: uunpkhi z23.s, z20.h +; VBITS_GE_128-NEXT: uunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: uunpklo z20.s, z20.h +; VBITS_GE_128-NEXT: uunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: udivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_128-NEXT: udivr z20.s, p1/m, z20.s, z22.s +; VBITS_GE_128-NEXT: uunpklo z22.h, z19.b +; VBITS_GE_128-NEXT: uunpklo z24.h, z2.b +; VBITS_GE_128-NEXT: uunpkhi z25.s, z22.h +; VBITS_GE_128-NEXT: uunpkhi z26.s, z24.h +; VBITS_GE_128-NEXT: uunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: uunpklo z24.s, z24.h +; VBITS_GE_128-NEXT: udivr z25.s, p1/m, z25.s, z26.s +; VBITS_GE_128-NEXT: udivr z22.s, p1/m, z22.s, z24.s +; VBITS_GE_128-NEXT: uzp1 z20.h, z20.h, z23.h +; VBITS_GE_128-NEXT: uunpkhi z23.h, z21.b +; VBITS_GE_128-NEXT: uunpkhi z24.h, z3.b +; VBITS_GE_128-NEXT: uzp1 z22.h, z22.h, z25.h +; VBITS_GE_128-NEXT: uunpkhi z25.s, z23.h +; VBITS_GE_128-NEXT: uunpkhi z26.s, z24.h +; VBITS_GE_128-NEXT: uunpklo z23.s, z23.h +; VBITS_GE_128-NEXT: uunpklo z24.s, z24.h +; VBITS_GE_128-NEXT: udivr z25.s, p1/m, z25.s, z26.s +; VBITS_GE_128-NEXT: udivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_128-NEXT: uunpklo z24.h, z21.b +; VBITS_GE_128-NEXT: uunpklo z26.h, z3.b +; VBITS_GE_128-NEXT: uunpkhi z27.s, z24.h +; VBITS_GE_128-NEXT: uunpkhi z28.s, z26.h +; VBITS_GE_128-NEXT: uunpklo z24.s, z24.h +; VBITS_GE_128-NEXT: uunpklo z26.s, z26.h +; VBITS_GE_128-NEXT: udivr z27.s, p1/m, z27.s, z28.s +; VBITS_GE_128-NEXT: udivr z24.s, p1/m, z24.s, z26.s +; VBITS_GE_128-NEXT: ld1b { z26.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: uzp1 z23.h, z23.h, z25.h +; VBITS_GE_128-NEXT: uzp1 z24.h, z24.h, z27.h +; VBITS_GE_128-NEXT: uzp1 z20.b, z22.b, z20.b +; VBITS_GE_128-NEXT: ld1b { z28.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: uzp1 z22.b, z24.b, z23.b +; VBITS_GE_128-NEXT: mls z2.b, p0/m, z20.b, z19.b +; VBITS_GE_128-NEXT: uunpkhi z19.h, z26.b +; VBITS_GE_128-NEXT: uunpkhi z20.h, z4.b +; VBITS_GE_128-NEXT: mls z3.b, p0/m, z22.b, z21.b +; VBITS_GE_128-NEXT: uunpkhi z21.s, z19.h +; VBITS_GE_128-NEXT: uunpkhi z22.s, z20.h +; VBITS_GE_128-NEXT: uunpklo z19.s, z19.h +; VBITS_GE_128-NEXT: uunpklo z20.s, z20.h +; VBITS_GE_128-NEXT: udivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_128-NEXT: udivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_128-NEXT: uunpklo z20.h, z26.b +; VBITS_GE_128-NEXT: uunpklo z22.h, z4.b +; VBITS_GE_128-NEXT: uunpkhi z23.s, z20.h +; 
VBITS_GE_128-NEXT: uunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: uunpklo z20.s, z20.h +; VBITS_GE_128-NEXT: uunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: udivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_128-NEXT: udivr z20.s, p1/m, z20.s, z22.s +; VBITS_GE_128-NEXT: uzp1 z21.h, z19.h, z21.h +; VBITS_GE_128-NEXT: uunpkhi z19.h, z28.b +; VBITS_GE_128-NEXT: uunpkhi z22.h, z6.b +; VBITS_GE_128-NEXT: uzp1 z20.h, z20.h, z23.h +; VBITS_GE_128-NEXT: uunpkhi z23.s, z19.h +; VBITS_GE_128-NEXT: uunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: uunpklo z19.s, z19.h +; VBITS_GE_128-NEXT: uunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: udivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_128-NEXT: udiv z22.s, p1/m, z22.s, z19.s +; VBITS_GE_128-NEXT: uunpklo z19.h, z28.b +; VBITS_GE_128-NEXT: uunpklo z24.h, z6.b +; VBITS_GE_128-NEXT: uunpkhi z25.s, z19.h +; VBITS_GE_128-NEXT: uunpkhi z27.s, z24.h +; VBITS_GE_128-NEXT: uunpklo z19.s, z19.h +; VBITS_GE_128-NEXT: udivr z25.s, p1/m, z25.s, z27.s +; VBITS_GE_128-NEXT: ld1b { z27.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: uunpklo z24.s, z24.h +; VBITS_GE_128-NEXT: uzp1 z22.h, z22.h, z23.h +; VBITS_GE_128-NEXT: udiv z24.s, p1/m, z24.s, z19.s +; VBITS_GE_128-NEXT: uzp1 z20.b, z20.b, z21.b +; VBITS_GE_128-NEXT: uzp1 z23.h, z24.h, z25.h +; VBITS_GE_128-NEXT: mls z4.b, p0/m, z20.b, z26.b +; VBITS_GE_128-NEXT: uzp1 z21.b, z23.b, z22.b +; VBITS_GE_128-NEXT: uunpkhi z20.h, z27.b +; VBITS_GE_128-NEXT: mls z6.b, p0/m, z21.b, z28.b +; VBITS_GE_128-NEXT: uunpkhi z21.h, z7.b +; VBITS_GE_128-NEXT: uunpkhi z22.s, z20.h +; VBITS_GE_128-NEXT: uunpkhi z23.s, z21.h +; VBITS_GE_128-NEXT: uunpklo z20.s, z20.h +; VBITS_GE_128-NEXT: uunpklo z21.s, z21.h +; VBITS_GE_128-NEXT: udivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_128-NEXT: udivr z20.s, p1/m, z20.s, z21.s +; VBITS_GE_128-NEXT: uunpklo z21.h, z27.b +; VBITS_GE_128-NEXT: uunpklo z23.h, z7.b +; VBITS_GE_128-NEXT: uunpkhi z24.s, z21.h +; VBITS_GE_128-NEXT: uunpkhi z25.s, z23.h +; VBITS_GE_128-NEXT: uunpklo z21.s, z21.h +; VBITS_GE_128-NEXT: uunpklo z23.s, z23.h +; VBITS_GE_128-NEXT: udivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_128-NEXT: udivr z21.s, p1/m, z21.s, z23.s +; VBITS_GE_128-NEXT: uzp1 z20.h, z20.h, z22.h +; VBITS_GE_128-NEXT: uunpkhi z22.h, z18.b +; VBITS_GE_128-NEXT: uunpkhi z23.h, z0.b +; VBITS_GE_128-NEXT: uzp1 z21.h, z21.h, z24.h +; VBITS_GE_128-NEXT: uunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: uunpkhi z25.s, z23.h +; VBITS_GE_128-NEXT: uunpklo z22.s, z22.h +; VBITS_GE_128-NEXT: uunpklo z23.s, z23.h +; VBITS_GE_128-NEXT: udivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_128-NEXT: udivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_128-NEXT: uunpklo z23.h, z18.b +; VBITS_GE_128-NEXT: uunpklo z25.h, z0.b +; VBITS_GE_128-NEXT: ld1b { z19.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: uunpkhi z26.s, z23.h +; VBITS_GE_128-NEXT: uunpkhi z28.s, z25.h +; VBITS_GE_128-NEXT: uunpklo z23.s, z23.h +; VBITS_GE_128-NEXT: uunpklo z25.s, z25.h +; VBITS_GE_128-NEXT: udivr z26.s, p1/m, z26.s, z28.s +; VBITS_GE_128-NEXT: udivr z23.s, p1/m, z23.s, z25.s +; VBITS_GE_128-NEXT: uzp1 z22.h, z22.h, z24.h +; VBITS_GE_128-NEXT: uzp1 z23.h, z23.h, z26.h +; VBITS_GE_128-NEXT: uzp1 z20.b, z21.b, z20.b +; VBITS_GE_128-NEXT: uzp1 z21.b, z23.b, z22.b +; VBITS_GE_128-NEXT: mls z7.b, p0/m, z20.b, z27.b +; VBITS_GE_128-NEXT: mls z0.b, p0/m, z21.b, z18.b +; VBITS_GE_128-NEXT: uunpkhi z18.h, z19.b +; VBITS_GE_128-NEXT: uunpkhi z20.h, z5.b +; VBITS_GE_128-NEXT: uunpkhi z21.s, z18.h +; VBITS_GE_128-NEXT: uunpkhi z22.s, z20.h +; VBITS_GE_128-NEXT: uunpklo z18.s, z18.h +; VBITS_GE_128-NEXT: uunpklo z20.s, z20.h +; 
VBITS_GE_128-NEXT: udivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_128-NEXT: udivr z18.s, p1/m, z18.s, z20.s +; VBITS_GE_128-NEXT: uunpklo z20.h, z19.b +; VBITS_GE_128-NEXT: uunpklo z22.h, z5.b +; VBITS_GE_128-NEXT: uunpkhi z23.s, z20.h +; VBITS_GE_128-NEXT: uunpkhi z24.s, z22.h +; VBITS_GE_128-NEXT: uzp1 z18.h, z18.h, z21.h +; VBITS_GE_128-NEXT: movprfx z21, z24 +; VBITS_GE_128-NEXT: udiv z21.s, p1/m, z21.s, z23.s +; VBITS_GE_128-NEXT: uunpklo z20.s, z20.h +; VBITS_GE_128-NEXT: stp q3, q2, [x0, #32] +; VBITS_GE_128-NEXT: mls z1.b, p0/m, z17.b, z16.b +; VBITS_GE_128-NEXT: stp q6, q4, [x0, #64] +; VBITS_GE_128-NEXT: stp q0, q7, [x0, #96] +; VBITS_GE_128-NEXT: uunpklo z0.s, z22.h +; VBITS_GE_128-NEXT: udiv z0.s, p1/m, z0.s, z20.s +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z21.h +; VBITS_GE_128-NEXT: uzp1 z0.b, z0.b, z18.b +; VBITS_GE_128-NEXT: mls z5.b, p0/m, z0.b, z19.b +; VBITS_GE_128-NEXT: stp q5, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v128i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #64 +; VBITS_GE_256-NEXT: mov w9, #96 +; VBITS_GE_256-NEXT: mov w10, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: uunpkhi z7.h, z1.b +; VBITS_GE_256-NEXT: uunpkhi z6.h, z4.b +; VBITS_GE_256-NEXT: uunpkhi z17.s, z7.h +; VBITS_GE_256-NEXT: uunpkhi z16.s, z6.h +; VBITS_GE_256-NEXT: uunpklo z6.s, z6.h +; VBITS_GE_256-NEXT: uunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: udivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_256-NEXT: udivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_256-NEXT: uunpklo z7.h, z4.b +; VBITS_GE_256-NEXT: uunpklo z17.h, z1.b +; VBITS_GE_256-NEXT: uunpkhi z18.s, z7.h +; VBITS_GE_256-NEXT: uunpkhi z19.s, z17.h +; VBITS_GE_256-NEXT: uunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: uunpklo z17.s, z17.h +; VBITS_GE_256-NEXT: udivr z18.s, p1/m, z18.s, z19.s +; VBITS_GE_256-NEXT: udivr z7.s, p1/m, z7.s, z17.s +; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z16.h +; VBITS_GE_256-NEXT: uunpkhi z16.h, z5.b +; VBITS_GE_256-NEXT: uunpkhi z17.h, z2.b +; VBITS_GE_256-NEXT: uzp1 z7.h, z7.h, z18.h +; VBITS_GE_256-NEXT: uunpkhi z18.s, z16.h +; VBITS_GE_256-NEXT: uunpkhi z19.s, z17.h +; VBITS_GE_256-NEXT: uunpklo z16.s, z16.h +; VBITS_GE_256-NEXT: uunpklo z17.s, z17.h +; VBITS_GE_256-NEXT: udivr z18.s, p1/m, z18.s, z19.s +; VBITS_GE_256-NEXT: udivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_256-NEXT: uunpklo z17.h, z5.b +; VBITS_GE_256-NEXT: uunpklo z19.h, z2.b +; VBITS_GE_256-NEXT: uunpkhi z20.s, z17.h +; VBITS_GE_256-NEXT: uunpkhi z21.s, z19.h +; VBITS_GE_256-NEXT: uunpklo z17.s, z17.h +; VBITS_GE_256-NEXT: udivr z20.s, p1/m, z20.s, z21.s +; VBITS_GE_256-NEXT: ld1b { z21.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: uunpklo z19.s, z19.h +; VBITS_GE_256-NEXT: uzp1 z16.h, z16.h, z18.h +; VBITS_GE_256-NEXT: udivr z17.s, p1/m, z17.s, z19.s +; VBITS_GE_256-NEXT: uzp1 z6.b, z7.b, z6.b +; VBITS_GE_256-NEXT: uzp1 z17.h, z17.h, z20.h +; VBITS_GE_256-NEXT: ld1b { z19.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: uzp1 z7.b, z17.b, z16.b +; VBITS_GE_256-NEXT: mls z1.b, p0/m, z6.b, z4.b +; VBITS_GE_256-NEXT: mls z2.b, p0/m, z7.b, z5.b +; VBITS_GE_256-NEXT: uunpkhi z4.h, z21.b +; VBITS_GE_256-NEXT: uunpkhi z5.h, z3.b +; VBITS_GE_256-NEXT: uunpkhi z6.s, z4.h +; VBITS_GE_256-NEXT: 
uunpkhi z7.s, z5.h +; VBITS_GE_256-NEXT: uunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: uunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: udivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_256-NEXT: udivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_256-NEXT: uunpklo z5.h, z21.b +; VBITS_GE_256-NEXT: uunpklo z7.h, z3.b +; VBITS_GE_256-NEXT: uunpkhi z16.s, z5.h +; VBITS_GE_256-NEXT: uunpkhi z17.s, z7.h +; VBITS_GE_256-NEXT: uunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: uunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: udivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_256-NEXT: udivr z5.s, p1/m, z5.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z6.h +; VBITS_GE_256-NEXT: uunpkhi z6.h, z19.b +; VBITS_GE_256-NEXT: uunpkhi z7.h, z0.b +; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z16.h +; VBITS_GE_256-NEXT: uunpkhi z16.s, z6.h +; VBITS_GE_256-NEXT: uunpkhi z17.s, z7.h +; VBITS_GE_256-NEXT: uunpklo z6.s, z6.h +; VBITS_GE_256-NEXT: uunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: udivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_256-NEXT: udivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_256-NEXT: uunpklo z7.h, z19.b +; VBITS_GE_256-NEXT: uunpklo z17.h, z0.b +; VBITS_GE_256-NEXT: uunpkhi z18.s, z7.h +; VBITS_GE_256-NEXT: uunpkhi z20.s, z17.h +; VBITS_GE_256-NEXT: uunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: uunpklo z17.s, z17.h +; VBITS_GE_256-NEXT: udivr z18.s, p1/m, z18.s, z20.s +; VBITS_GE_256-NEXT: udivr z7.s, p1/m, z7.s, z17.s +; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z16.h +; VBITS_GE_256-NEXT: uzp1 z7.h, z7.h, z18.h +; VBITS_GE_256-NEXT: uzp1 z4.b, z5.b, z4.b +; VBITS_GE_256-NEXT: uzp1 z5.b, z7.b, z6.b +; VBITS_GE_256-NEXT: mls z3.b, p0/m, z4.b, z21.b +; VBITS_GE_256-NEXT: mls z0.b, p0/m, z5.b, z19.b +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <128 x i8>, <128 x i8>* %a + %op2 = load <128 x i8>, <128 x i8>* %b + %res = urem <128 x i8> %op1, %op2 + store <128 x i8> %res, <128 x i8>* %a + ret void +} + +define void @urem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; VBITS_GE_128-LABEL: urem_v256i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 32 +; VBITS_GE_128-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_offset b8, -8 +; VBITS_GE_128-NEXT: .cfi_offset b9, -16 +; VBITS_GE_128-NEXT: .cfi_offset b10, -32 +; VBITS_GE_128-NEXT: mov w3, #240 +; VBITS_GE_128-NEXT: mov w9, #224 +; VBITS_GE_128-NEXT: mov w10, #208 +; VBITS_GE_128-NEXT: mov w11, #192 +; VBITS_GE_128-NEXT: mov w12, #176 +; VBITS_GE_128-NEXT: mov w13, #160 +; VBITS_GE_128-NEXT: mov w14, #144 +; VBITS_GE_128-NEXT: mov w15, #128 +; VBITS_GE_128-NEXT: mov w16, #112 +; VBITS_GE_128-NEXT: mov w17, #96 +; VBITS_GE_128-NEXT: mov w18, #80 +; VBITS_GE_128-NEXT: mov w2, #64 +; VBITS_GE_128-NEXT: mov w4, #48 +; VBITS_GE_128-NEXT: mov w5, #32 +; VBITS_GE_128-NEXT: mov w8, #16 +; VBITS_GE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x3] +; VBITS_GE_128-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_128-NEXT: ld1b { z0.b }, p0/z, [x0, x10] +; VBITS_GE_128-NEXT: ld1b { z16.b }, p0/z, [x0, x11] +; VBITS_GE_128-NEXT: ld1b { z7.b }, p0/z, [x0, x12] +; VBITS_GE_128-NEXT: ld1b { z6.b }, p0/z, [x0, x13] +; VBITS_GE_128-NEXT: ld1b { z5.b }, p0/z, [x0, x14] +; VBITS_GE_128-NEXT: ld1b { z4.b }, p0/z, [x0, x15] +; VBITS_GE_128-NEXT: ld1b { z3.b }, p0/z, [x0, x16] +; VBITS_GE_128-NEXT: ld1b { z23.b }, p0/z, [x0, x17] +; VBITS_GE_128-NEXT: ld1b { z22.b }, p0/z, [x0, x18] +; VBITS_GE_128-NEXT: ld1b { z21.b }, p0/z, [x0, x2] +; VBITS_GE_128-NEXT: ld1b { z19.b }, p0/z, [x0, x4] +; VBITS_GE_128-NEXT: ld1b { z18.b }, p0/z, [x0, x5] +; VBITS_GE_128-NEXT: ld1b { z17.b }, p0/z, [x0, x8] +; VBITS_GE_128-NEXT: ld1b { z20.b }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1b { z26.b }, p0/z, [x1, x5] +; VBITS_GE_128-NEXT: ptrue p1.s, vl4 +; VBITS_GE_128-NEXT: ld1b { z25.b }, p0/z, [x1, x4] +; VBITS_GE_128-NEXT: ld1b { z24.b }, p0/z, [x1, x3] +; VBITS_GE_128-NEXT: uunpkhi z28.h, z18.b +; VBITS_GE_128-NEXT: uunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: uunpkhi z27.h, z26.b +; VBITS_GE_128-NEXT: uunpkhi z29.s, z27.h +; VBITS_GE_128-NEXT: uunpklo z27.s, z27.h +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: udivr z27.s, p1/m, z27.s, z28.s +; VBITS_GE_128-NEXT: uunpklo z28.h, z26.b +; VBITS_GE_128-NEXT: uunpklo z30.h, z18.b +; VBITS_GE_128-NEXT: uunpkhi z31.s, z28.h +; VBITS_GE_128-NEXT: uunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: udivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: udivr z28.s, p1/m, z28.s, z30.s +; VBITS_GE_128-NEXT: uzp1 z29.h, z27.h, z29.h +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z31.h +; VBITS_GE_128-NEXT: uzp1 z28.b, z28.b, z29.b +; VBITS_GE_128-NEXT: uunpkhi z29.h, z25.b +; VBITS_GE_128-NEXT: uunpkhi z30.h, z19.b +; VBITS_GE_128-NEXT: uunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: uunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: ld1b { z27.b }, p0/z, [x1, x2] +; VBITS_GE_128-NEXT: udivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: uunpklo z30.h, z25.b +; VBITS_GE_128-NEXT: uunpklo z8.h, z19.b +; VBITS_GE_128-NEXT: uunpkhi z9.s, z30.h +; VBITS_GE_128-NEXT: uunpkhi z10.s, z8.h +; VBITS_GE_128-NEXT: uunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: uunpklo z8.s, z8.h +; VBITS_GE_128-NEXT: udivr z9.s, p1/m, z9.s, z10.s +; VBITS_GE_128-NEXT: udivr z30.s, p1/m, z30.s, z8.s +; VBITS_GE_128-NEXT: uzp1 z29.h, 
z29.h, z31.h +; VBITS_GE_128-NEXT: uzp1 z30.h, z30.h, z9.h +; VBITS_GE_128-NEXT: mls z18.b, p0/m, z28.b, z26.b +; VBITS_GE_128-NEXT: uzp1 z28.b, z30.b, z29.b +; VBITS_GE_128-NEXT: uunpkhi z26.h, z27.b +; VBITS_GE_128-NEXT: uunpkhi z29.h, z21.b +; VBITS_GE_128-NEXT: uunpkhi z30.s, z26.h +; VBITS_GE_128-NEXT: uunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z26.s, z26.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: udivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: udiv z29.s, p1/m, z29.s, z26.s +; VBITS_GE_128-NEXT: uunpklo z26.h, z27.b +; VBITS_GE_128-NEXT: uunpklo z31.h, z21.b +; VBITS_GE_128-NEXT: uunpkhi z8.s, z26.h +; VBITS_GE_128-NEXT: uunpkhi z9.s, z31.h +; VBITS_GE_128-NEXT: ld1b { z10.b }, p0/z, [x1, x18] +; VBITS_GE_128-NEXT: udivr z8.s, p1/m, z8.s, z9.s +; VBITS_GE_128-NEXT: uunpklo z9.s, z26.h +; VBITS_GE_128-NEXT: uunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: udiv z31.s, p1/m, z31.s, z9.s +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z30.h, z31.h, z8.h +; VBITS_GE_128-NEXT: mls z19.b, p0/m, z28.b, z25.b +; VBITS_GE_128-NEXT: uzp1 z25.b, z30.b, z29.b +; VBITS_GE_128-NEXT: uunpkhi z28.h, z10.b +; VBITS_GE_128-NEXT: uunpkhi z29.h, z22.b +; VBITS_GE_128-NEXT: uunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: uunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: ld1b { z26.b }, p0/z, [x1, x17] +; VBITS_GE_128-NEXT: udivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: udivr z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: uunpklo z29.h, z10.b +; VBITS_GE_128-NEXT: uunpklo z31.h, z22.b +; VBITS_GE_128-NEXT: uunpkhi z8.s, z29.h +; VBITS_GE_128-NEXT: uunpkhi z9.s, z31.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: udivr z8.s, p1/m, z8.s, z9.s +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z8.h +; VBITS_GE_128-NEXT: mls z21.b, p0/m, z25.b, z27.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: uunpkhi z25.h, z26.b +; VBITS_GE_128-NEXT: uunpkhi z28.h, z23.b +; VBITS_GE_128-NEXT: uunpkhi z29.s, z25.h +; VBITS_GE_128-NEXT: uunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z25.s, z25.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: udiv z28.s, p1/m, z28.s, z25.s +; VBITS_GE_128-NEXT: uunpklo z25.h, z26.b +; VBITS_GE_128-NEXT: uunpklo z30.h, z23.b +; VBITS_GE_128-NEXT: uunpkhi z31.s, z25.h +; VBITS_GE_128-NEXT: uunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: ld1b { z9.b }, p0/z, [x1, x16] +; VBITS_GE_128-NEXT: udivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: uunpklo z8.s, z25.h +; VBITS_GE_128-NEXT: uunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: udiv z30.s, p1/m, z30.s, z8.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z29.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z30.h, z31.h +; VBITS_GE_128-NEXT: mls z22.b, p0/m, z27.b, z10.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: uunpkhi z28.h, z9.b +; VBITS_GE_128-NEXT: uunpkhi z29.h, z3.b +; VBITS_GE_128-NEXT: uunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: uunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: ld1b { z25.b }, p0/z, [x1, x15] +; VBITS_GE_128-NEXT: udivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: udivr z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: uunpklo z29.h, z9.b +; VBITS_GE_128-NEXT: 
uunpklo z31.h, z3.b +; VBITS_GE_128-NEXT: uunpkhi z8.s, z29.h +; VBITS_GE_128-NEXT: uunpkhi z10.s, z31.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: udivr z8.s, p1/m, z8.s, z10.s +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z8.h +; VBITS_GE_128-NEXT: mls z23.b, p0/m, z27.b, z26.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: uunpkhi z26.h, z25.b +; VBITS_GE_128-NEXT: uunpkhi z28.h, z4.b +; VBITS_GE_128-NEXT: uunpkhi z29.s, z26.h +; VBITS_GE_128-NEXT: uunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z26.s, z26.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: udiv z28.s, p1/m, z28.s, z26.s +; VBITS_GE_128-NEXT: uunpklo z26.h, z25.b +; VBITS_GE_128-NEXT: uunpklo z30.h, z4.b +; VBITS_GE_128-NEXT: uunpkhi z31.s, z26.h +; VBITS_GE_128-NEXT: uunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: ld1b { z10.b }, p0/z, [x1, x14] +; VBITS_GE_128-NEXT: udivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: uunpklo z8.s, z26.h +; VBITS_GE_128-NEXT: uunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: udiv z30.s, p1/m, z30.s, z8.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z29.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z30.h, z31.h +; VBITS_GE_128-NEXT: mls z3.b, p0/m, z27.b, z9.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: uunpkhi z28.h, z10.b +; VBITS_GE_128-NEXT: uunpkhi z29.h, z5.b +; VBITS_GE_128-NEXT: uunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: uunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: ld1b { z26.b }, p0/z, [x1, x13] +; VBITS_GE_128-NEXT: udivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: udivr z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: uunpklo z29.h, z10.b +; VBITS_GE_128-NEXT: uunpklo z31.h, z5.b +; VBITS_GE_128-NEXT: uunpkhi z8.s, z29.h +; VBITS_GE_128-NEXT: uunpkhi z9.s, z31.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: udivr z8.s, p1/m, z8.s, z9.s +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z8.h +; VBITS_GE_128-NEXT: mls z4.b, p0/m, z27.b, z25.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: uunpkhi z25.h, z26.b +; VBITS_GE_128-NEXT: uunpkhi z28.h, z6.b +; VBITS_GE_128-NEXT: uunpkhi z29.s, z25.h +; VBITS_GE_128-NEXT: uunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z25.s, z25.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: udiv z28.s, p1/m, z28.s, z25.s +; VBITS_GE_128-NEXT: uunpklo z25.h, z26.b +; VBITS_GE_128-NEXT: uunpklo z30.h, z6.b +; VBITS_GE_128-NEXT: uunpkhi z31.s, z25.h +; VBITS_GE_128-NEXT: uunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: ld1b { z9.b }, p0/z, [x1, x12] +; VBITS_GE_128-NEXT: udivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: uunpklo z8.s, z25.h +; VBITS_GE_128-NEXT: uunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: udiv z30.s, p1/m, z30.s, z8.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z29.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z30.h, z31.h +; VBITS_GE_128-NEXT: mls z5.b, p0/m, z27.b, z10.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: uunpkhi z28.h, z9.b +; VBITS_GE_128-NEXT: uunpkhi z29.h, z7.b +; VBITS_GE_128-NEXT: uunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: uunpkhi 
z31.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: ld1b { z25.b }, p0/z, [x1, x11] +; VBITS_GE_128-NEXT: udivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: udivr z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: uunpklo z29.h, z9.b +; VBITS_GE_128-NEXT: uunpklo z31.h, z7.b +; VBITS_GE_128-NEXT: uunpkhi z8.s, z29.h +; VBITS_GE_128-NEXT: uunpkhi z10.s, z31.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: udivr z8.s, p1/m, z8.s, z10.s +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z8.h +; VBITS_GE_128-NEXT: mls z6.b, p0/m, z27.b, z26.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: uunpkhi z26.h, z25.b +; VBITS_GE_128-NEXT: uunpkhi z28.h, z16.b +; VBITS_GE_128-NEXT: uunpkhi z29.s, z26.h +; VBITS_GE_128-NEXT: uunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z26.s, z26.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: udiv z28.s, p1/m, z28.s, z26.s +; VBITS_GE_128-NEXT: uunpklo z26.h, z25.b +; VBITS_GE_128-NEXT: uunpklo z30.h, z16.b +; VBITS_GE_128-NEXT: uunpkhi z31.s, z26.h +; VBITS_GE_128-NEXT: uunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: ld1b { z10.b }, p0/z, [x1, x10] +; VBITS_GE_128-NEXT: udivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: uunpklo z8.s, z26.h +; VBITS_GE_128-NEXT: uunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: udiv z30.s, p1/m, z30.s, z8.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z29.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z30.h, z31.h +; VBITS_GE_128-NEXT: mls z7.b, p0/m, z27.b, z9.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: uunpkhi z28.h, z10.b +; VBITS_GE_128-NEXT: uunpkhi z29.h, z0.b +; VBITS_GE_128-NEXT: uunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: uunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: ld1b { z26.b }, p0/z, [x1, x9] +; VBITS_GE_128-NEXT: udivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: udivr z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: uunpklo z29.h, z10.b +; VBITS_GE_128-NEXT: uunpklo z31.h, z0.b +; VBITS_GE_128-NEXT: uunpkhi z8.s, z29.h +; VBITS_GE_128-NEXT: uunpkhi z9.s, z31.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: udivr z8.s, p1/m, z8.s, z9.s +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z8.h +; VBITS_GE_128-NEXT: mls z16.b, p0/m, z27.b, z25.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: uunpkhi z25.h, z26.b +; VBITS_GE_128-NEXT: uunpkhi z28.h, z1.b +; VBITS_GE_128-NEXT: uunpkhi z29.s, z25.h +; VBITS_GE_128-NEXT: uunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: uunpklo z30.h, z26.b +; VBITS_GE_128-NEXT: uunpklo z31.h, z1.b +; VBITS_GE_128-NEXT: uunpklo z25.s, z25.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: uunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: uunpkhi z9.s, z31.h +; VBITS_GE_128-NEXT: uunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: uunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: udiv z28.s, p1/m, z28.s, z25.s +; VBITS_GE_128-NEXT: udivr z8.s, p1/m, z8.s, z9.s +; VBITS_GE_128-NEXT: udivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z29.h +; VBITS_GE_128-NEXT: uzp1 
z29.h, z30.h, z8.h +; VBITS_GE_128-NEXT: mls z0.b, p0/m, z27.b, z10.b +; VBITS_GE_128-NEXT: uzp1 z27.b, z29.b, z28.b +; VBITS_GE_128-NEXT: uunpkhi z28.h, z24.b +; VBITS_GE_128-NEXT: uunpkhi z29.h, z2.b +; VBITS_GE_128-NEXT: uunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: uunpkhi z31.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: ld1b { z25.b }, p0/z, [x1, x8] +; VBITS_GE_128-NEXT: ld1b { z9.b }, p0/z, [x1] +; VBITS_GE_128-NEXT: udivr z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: udivr z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: uunpklo z29.h, z24.b +; VBITS_GE_128-NEXT: uunpklo z31.h, z2.b +; VBITS_GE_128-NEXT: uunpkhi z8.s, z29.h +; VBITS_GE_128-NEXT: uunpkhi z10.s, z31.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z31.s, z31.h +; VBITS_GE_128-NEXT: udivr z8.s, p1/m, z8.s, z10.s +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z29.h, z8.h +; VBITS_GE_128-NEXT: mls z1.b, p0/m, z27.b, z26.b +; VBITS_GE_128-NEXT: uzp1 z26.b, z29.b, z28.b +; VBITS_GE_128-NEXT: uunpkhi z27.h, z9.b +; VBITS_GE_128-NEXT: uunpkhi z28.h, z20.b +; VBITS_GE_128-NEXT: uunpkhi z29.s, z27.h +; VBITS_GE_128-NEXT: uunpkhi z30.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z27.s, z27.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: udivr z27.s, p1/m, z27.s, z28.s +; VBITS_GE_128-NEXT: uunpklo z28.h, z9.b +; VBITS_GE_128-NEXT: uunpklo z30.h, z20.b +; VBITS_GE_128-NEXT: uunpkhi z31.s, z28.h +; VBITS_GE_128-NEXT: uunpkhi z8.s, z30.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z30.s, z30.h +; VBITS_GE_128-NEXT: udivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: udivr z28.s, p1/m, z28.s, z30.s +; VBITS_GE_128-NEXT: uzp1 z27.h, z27.h, z29.h +; VBITS_GE_128-NEXT: uzp1 z28.h, z28.h, z31.h +; VBITS_GE_128-NEXT: mls z2.b, p0/m, z26.b, z24.b +; VBITS_GE_128-NEXT: uzp1 z24.b, z28.b, z27.b +; VBITS_GE_128-NEXT: mls z20.b, p0/m, z24.b, z9.b +; VBITS_GE_128-NEXT: uunpkhi z24.h, z25.b +; VBITS_GE_128-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; VBITS_GE_128-NEXT: stp q18, q19, [x0, #32] +; VBITS_GE_128-NEXT: stp q21, q22, [x0, #64] +; VBITS_GE_128-NEXT: stp q23, q3, [x0, #96] +; VBITS_GE_128-NEXT: uunpklo z3.h, z17.b +; VBITS_GE_128-NEXT: stp q4, q5, [x0, #128] +; VBITS_GE_128-NEXT: uunpkhi z4.s, z3.h +; VBITS_GE_128-NEXT: stp q16, q0, [x0, #192] +; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: stp q6, q7, [x0, #160] +; VBITS_GE_128-NEXT: stp q1, q2, [x0, #224] +; VBITS_GE_128-NEXT: uunpkhi z1.h, z17.b +; VBITS_GE_128-NEXT: uunpkhi z2.s, z24.h +; VBITS_GE_128-NEXT: uunpkhi z0.s, z1.h +; VBITS_GE_128-NEXT: udiv z0.s, p1/m, z0.s, z2.s +; VBITS_GE_128-NEXT: uunpklo z2.s, z24.h +; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: udiv z1.s, p1/m, z1.s, z2.s +; VBITS_GE_128-NEXT: uzp1 z0.h, z1.h, z0.h +; VBITS_GE_128-NEXT: uunpklo z1.h, z25.b +; VBITS_GE_128-NEXT: uunpkhi z2.s, z1.h +; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_128-NEXT: udivr z2.s, p1/m, z2.s, z4.s +; VBITS_GE_128-NEXT: udivr z1.s, p1/m, z1.s, z3.s +; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z2.h +; VBITS_GE_128-NEXT: uzp1 z0.b, z1.b, z0.b +; VBITS_GE_128-NEXT: mls z17.b, p0/m, z0.b, z25.b +; VBITS_GE_128-NEXT: stp q20, q17, [x0] +; VBITS_GE_128-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v256i8: +; 
VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #192 +; VBITS_GE_256-NEXT: mov w9, #224 +; VBITS_GE_256-NEXT: mov w10, #128 +; VBITS_GE_256-NEXT: mov w11, #160 +; VBITS_GE_256-NEXT: mov w12, #64 +; VBITS_GE_256-NEXT: mov w13, #96 +; VBITS_GE_256-NEXT: mov w14, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x0, x11] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x0, x12] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x13] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x14] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z18.b }, p0/z, [x1, x14] +; VBITS_GE_256-NEXT: ld1b { z19.b }, p0/z, [x1, x13] +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1b { z28.b }, p0/z, [x1, x11] +; VBITS_GE_256-NEXT: ld1b { z17.b }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1b { z16.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: uunpkhi z22.h, z2.b +; VBITS_GE_256-NEXT: uunpkhi z20.h, z18.b +; VBITS_GE_256-NEXT: uunpkhi z23.s, z22.h +; VBITS_GE_256-NEXT: uunpkhi z21.s, z20.h +; VBITS_GE_256-NEXT: uunpklo z20.s, z20.h +; VBITS_GE_256-NEXT: uunpklo z22.s, z22.h +; VBITS_GE_256-NEXT: udivr z21.s, p1/m, z21.s, z23.s +; VBITS_GE_256-NEXT: udivr z20.s, p1/m, z20.s, z22.s +; VBITS_GE_256-NEXT: uunpklo z22.h, z18.b +; VBITS_GE_256-NEXT: uunpklo z23.h, z2.b +; VBITS_GE_256-NEXT: uunpkhi z24.s, z22.h +; VBITS_GE_256-NEXT: uunpkhi z25.s, z23.h +; VBITS_GE_256-NEXT: uunpklo z22.s, z22.h +; VBITS_GE_256-NEXT: uunpklo z23.s, z23.h +; VBITS_GE_256-NEXT: udivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_256-NEXT: udivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: uzp1 z20.h, z20.h, z21.h +; VBITS_GE_256-NEXT: uzp1 z21.h, z22.h, z24.h +; VBITS_GE_256-NEXT: uunpkhi z22.h, z19.b +; VBITS_GE_256-NEXT: uunpkhi z23.h, z3.b +; VBITS_GE_256-NEXT: uunpkhi z24.s, z22.h +; VBITS_GE_256-NEXT: uunpkhi z25.s, z23.h +; VBITS_GE_256-NEXT: uunpklo z22.s, z22.h +; VBITS_GE_256-NEXT: uunpklo z23.s, z23.h +; VBITS_GE_256-NEXT: udivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_256-NEXT: udivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: uunpklo z23.h, z19.b +; VBITS_GE_256-NEXT: uunpklo z25.h, z3.b +; VBITS_GE_256-NEXT: uunpkhi z26.s, z23.h +; VBITS_GE_256-NEXT: uunpkhi z27.s, z25.h +; VBITS_GE_256-NEXT: uunpklo z23.s, z23.h +; VBITS_GE_256-NEXT: udivr z26.s, p1/m, z26.s, z27.s +; VBITS_GE_256-NEXT: uunpklo z25.s, z25.h +; VBITS_GE_256-NEXT: ld1b { z27.b }, p0/z, [x1, x12] +; VBITS_GE_256-NEXT: udivr z23.s, p1/m, z23.s, z25.s +; VBITS_GE_256-NEXT: uzp1 z22.h, z22.h, z24.h +; VBITS_GE_256-NEXT: uzp1 z23.h, z23.h, z26.h +; VBITS_GE_256-NEXT: uzp1 z20.b, z21.b, z20.b +; VBITS_GE_256-NEXT: uzp1 z21.b, z23.b, z22.b +; VBITS_GE_256-NEXT: mls z2.b, p0/m, z20.b, z18.b +; VBITS_GE_256-NEXT: mls z3.b, p0/m, z21.b, z19.b +; VBITS_GE_256-NEXT: uunpkhi z18.h, z27.b +; VBITS_GE_256-NEXT: uunpkhi z19.h, z4.b +; VBITS_GE_256-NEXT: uunpkhi z20.s, z18.h +; VBITS_GE_256-NEXT: uunpkhi z21.s, z19.h +; VBITS_GE_256-NEXT: uunpklo z18.s, z18.h +; VBITS_GE_256-NEXT: uunpklo z19.s, z19.h +; VBITS_GE_256-NEXT: udivr z20.s, p1/m, z20.s, z21.s +; VBITS_GE_256-NEXT: udivr z18.s, p1/m, z18.s, z19.s +; VBITS_GE_256-NEXT: uunpklo z19.h, z27.b +; VBITS_GE_256-NEXT: uunpklo z21.h, z4.b +; VBITS_GE_256-NEXT: uunpkhi z22.s, z19.h +; VBITS_GE_256-NEXT: uunpkhi z23.s, z21.h +; VBITS_GE_256-NEXT: uunpklo z19.s, z19.h +; VBITS_GE_256-NEXT: uunpklo z21.s, z21.h 
+; VBITS_GE_256-NEXT: udivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: udivr z19.s, p1/m, z19.s, z21.s +; VBITS_GE_256-NEXT: uzp1 z20.h, z18.h, z20.h +; VBITS_GE_256-NEXT: uunpkhi z18.h, z28.b +; VBITS_GE_256-NEXT: uunpkhi z21.h, z5.b +; VBITS_GE_256-NEXT: uzp1 z19.h, z19.h, z22.h +; VBITS_GE_256-NEXT: uunpkhi z22.s, z18.h +; VBITS_GE_256-NEXT: uunpkhi z23.s, z21.h +; VBITS_GE_256-NEXT: uunpklo z18.s, z18.h +; VBITS_GE_256-NEXT: uunpklo z21.s, z21.h +; VBITS_GE_256-NEXT: udivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: udiv z21.s, p1/m, z21.s, z18.s +; VBITS_GE_256-NEXT: uunpklo z18.h, z28.b +; VBITS_GE_256-NEXT: uunpklo z23.h, z5.b +; VBITS_GE_256-NEXT: uunpkhi z24.s, z18.h +; VBITS_GE_256-NEXT: uunpkhi z25.s, z23.h +; VBITS_GE_256-NEXT: ld1b { z26.b }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: udivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_256-NEXT: uunpklo z25.s, z18.h +; VBITS_GE_256-NEXT: uunpklo z23.s, z23.h +; VBITS_GE_256-NEXT: udiv z23.s, p1/m, z23.s, z25.s +; VBITS_GE_256-NEXT: uzp1 z21.h, z21.h, z22.h +; VBITS_GE_256-NEXT: uzp1 z22.h, z23.h, z24.h +; VBITS_GE_256-NEXT: uzp1 z19.b, z19.b, z20.b +; VBITS_GE_256-NEXT: uzp1 z20.b, z22.b, z21.b +; VBITS_GE_256-NEXT: mls z4.b, p0/m, z19.b, z27.b +; VBITS_GE_256-NEXT: mls z5.b, p0/m, z20.b, z28.b +; VBITS_GE_256-NEXT: uunpkhi z19.h, z26.b +; VBITS_GE_256-NEXT: uunpkhi z20.h, z6.b +; VBITS_GE_256-NEXT: uunpkhi z21.s, z19.h +; VBITS_GE_256-NEXT: uunpkhi z22.s, z20.h +; VBITS_GE_256-NEXT: uunpklo z19.s, z19.h +; VBITS_GE_256-NEXT: uunpklo z20.s, z20.h +; VBITS_GE_256-NEXT: udivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_256-NEXT: udivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_256-NEXT: uunpklo z20.h, z26.b +; VBITS_GE_256-NEXT: uunpklo z22.h, z6.b +; VBITS_GE_256-NEXT: uunpkhi z23.s, z20.h +; VBITS_GE_256-NEXT: uunpkhi z24.s, z22.h +; VBITS_GE_256-NEXT: uunpklo z20.s, z20.h +; VBITS_GE_256-NEXT: uunpklo z22.s, z22.h +; VBITS_GE_256-NEXT: udivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_256-NEXT: udivr z20.s, p1/m, z20.s, z22.s +; VBITS_GE_256-NEXT: uzp1 z19.h, z19.h, z21.h +; VBITS_GE_256-NEXT: uunpkhi z21.h, z17.b +; VBITS_GE_256-NEXT: uunpkhi z22.h, z7.b +; VBITS_GE_256-NEXT: uzp1 z20.h, z20.h, z23.h +; VBITS_GE_256-NEXT: uunpkhi z23.s, z21.h +; VBITS_GE_256-NEXT: uunpkhi z24.s, z22.h +; VBITS_GE_256-NEXT: uunpklo z21.s, z21.h +; VBITS_GE_256-NEXT: uunpklo z22.s, z22.h +; VBITS_GE_256-NEXT: udivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_256-NEXT: udivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_256-NEXT: uunpklo z22.h, z17.b +; VBITS_GE_256-NEXT: uunpklo z24.h, z7.b +; VBITS_GE_256-NEXT: uunpkhi z25.s, z22.h +; VBITS_GE_256-NEXT: uunpkhi z27.s, z24.h +; VBITS_GE_256-NEXT: uunpklo z22.s, z22.h +; VBITS_GE_256-NEXT: uunpklo z24.s, z24.h +; VBITS_GE_256-NEXT: udivr z25.s, p1/m, z25.s, z27.s +; VBITS_GE_256-NEXT: udivr z22.s, p1/m, z22.s, z24.s +; VBITS_GE_256-NEXT: uzp1 z21.h, z21.h, z23.h +; VBITS_GE_256-NEXT: uzp1 z22.h, z22.h, z25.h +; VBITS_GE_256-NEXT: uzp1 z19.b, z20.b, z19.b +; VBITS_GE_256-NEXT: uzp1 z20.b, z22.b, z21.b +; VBITS_GE_256-NEXT: ld1b { z18.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: mls z6.b, p0/m, z19.b, z26.b +; VBITS_GE_256-NEXT: mls z7.b, p0/m, z20.b, z17.b +; VBITS_GE_256-NEXT: uunpkhi z17.h, z16.b +; VBITS_GE_256-NEXT: uunpkhi z19.h, z0.b +; VBITS_GE_256-NEXT: uunpkhi z20.s, z17.h +; VBITS_GE_256-NEXT: uunpkhi z21.s, z19.h +; VBITS_GE_256-NEXT: uunpklo z17.s, z17.h +; VBITS_GE_256-NEXT: uunpklo z19.s, z19.h +; VBITS_GE_256-NEXT: udivr z20.s, p1/m, z20.s, z21.s +; VBITS_GE_256-NEXT: udivr z17.s, p1/m, z17.s, z19.s +; VBITS_GE_256-NEXT: 
uunpklo z19.h, z16.b +; VBITS_GE_256-NEXT: uunpklo z21.h, z0.b +; VBITS_GE_256-NEXT: uunpkhi z22.s, z19.h +; VBITS_GE_256-NEXT: uunpkhi z23.s, z21.h +; VBITS_GE_256-NEXT: uunpklo z19.s, z19.h +; VBITS_GE_256-NEXT: uunpklo z21.s, z21.h +; VBITS_GE_256-NEXT: udivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: udivr z19.s, p1/m, z19.s, z21.s +; VBITS_GE_256-NEXT: uzp1 z17.h, z17.h, z20.h +; VBITS_GE_256-NEXT: uunpkhi z20.h, z18.b +; VBITS_GE_256-NEXT: uunpkhi z21.h, z1.b +; VBITS_GE_256-NEXT: uzp1 z19.h, z19.h, z22.h +; VBITS_GE_256-NEXT: uunpkhi z22.s, z20.h +; VBITS_GE_256-NEXT: uunpkhi z23.s, z21.h +; VBITS_GE_256-NEXT: uunpklo z20.s, z20.h +; VBITS_GE_256-NEXT: uunpklo z21.s, z21.h +; VBITS_GE_256-NEXT: udivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: udivr z20.s, p1/m, z20.s, z21.s +; VBITS_GE_256-NEXT: uunpklo z21.h, z18.b +; VBITS_GE_256-NEXT: uunpklo z23.h, z1.b +; VBITS_GE_256-NEXT: uunpkhi z24.s, z21.h +; VBITS_GE_256-NEXT: uunpkhi z25.s, z23.h +; VBITS_GE_256-NEXT: uunpklo z21.s, z21.h +; VBITS_GE_256-NEXT: uunpklo z23.s, z23.h +; VBITS_GE_256-NEXT: udivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_256-NEXT: udivr z21.s, p1/m, z21.s, z23.s +; VBITS_GE_256-NEXT: uzp1 z20.h, z20.h, z22.h +; VBITS_GE_256-NEXT: uzp1 z21.h, z21.h, z24.h +; VBITS_GE_256-NEXT: uzp1 z17.b, z19.b, z17.b +; VBITS_GE_256-NEXT: uzp1 z19.b, z21.b, z20.b +; VBITS_GE_256-NEXT: mls z0.b, p0/m, z17.b, z16.b +; VBITS_GE_256-NEXT: mls z1.b, p0/m, z19.b, z18.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z7.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z6.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: st1b { z5.b }, p0, [x0, x11] +; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0, x12] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x13] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x14] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <256 x i8>, <256 x i8>* %a + %op2 = load <256 x i8>, <256 x i8>* %b + %res = urem <256 x i8> %op1, %op2 + store <256 x i8> %res, <256 x i8>* %a + ret void +} + +define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: urem_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = urem <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; VBITS_GE_128-LABEL: urem_v8i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: uunpkhi z2.s, z1.h +; VBITS_GE_128-NEXT: uunpkhi z3.s, z0.h +; VBITS_GE_128-NEXT: uunpklo z4.s, z1.h +; 
VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_128-NEXT: uunpklo z5.s, z0.h +; VBITS_GE_128-NEXT: movprfx z3, z5 +; VBITS_GE_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s +; VBITS_GE_128-NEXT: uzp1 z2.h, z3.h, z2.h +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: mls z0.h, p0/m, z2.h, z1.h +; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v8i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: urem_v8i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: uunpklo z2.s, z1.h +; VBITS_GE_512-NEXT: uunpklo z3.s, z0.h +; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; VBITS_GE_512-NEXT: ptrue p0.h, vl8 +; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_512-NEXT: mls z0.h, p0/m, z2.h, z1.h +; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_512-NEXT: ret + %res = urem <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @urem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: urem_v16i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ptrue p1.s, vl4 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: uunpkhi z5.s, z0.h +; VBITS_GE_128-NEXT: uunpkhi z16.s, z1.h +; VBITS_GE_128-NEXT: uunpkhi z4.s, z2.h +; VBITS_GE_128-NEXT: uunpkhi z7.s, z3.h +; VBITS_GE_128-NEXT: udivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_128-NEXT: uunpklo z5.s, z3.h +; VBITS_GE_128-NEXT: udivr z7.s, p1/m, z7.s, z16.s +; VBITS_GE_128-NEXT: uunpklo z16.s, z1.h +; VBITS_GE_128-NEXT: uunpklo z6.s, z2.h +; VBITS_GE_128-NEXT: udivr z5.s, p1/m, z5.s, z16.s +; VBITS_GE_128-NEXT: uunpklo z16.s, z0.h +; VBITS_GE_128-NEXT: uzp1 z5.h, z5.h, z7.h +; VBITS_GE_128-NEXT: udivr z6.s, p1/m, z6.s, z16.s +; VBITS_GE_128-NEXT: mls z1.h, p0/m, z5.h, z3.h +; VBITS_GE_128-NEXT: uzp1 z4.h, z6.h, z4.h +; VBITS_GE_128-NEXT: mls z0.h, p0/m, z4.h, z2.h +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: uunpkhi z2.s, z1.h +; VBITS_GE_256-NEXT: uunpkhi z3.s, z0.h +; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s +; VBITS_GE_256-NEXT: uunpklo z4.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z5.s, z0.h +; VBITS_GE_256-NEXT: movprfx z3, z5 +; VBITS_GE_256-NEXT: udiv z3.s, p1/m, z3.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z2.h +; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; 
VBITS_GE_512-LABEL: urem_v16i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl16 +; VBITS_GE_512-NEXT: ptrue p1.s, vl16 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: uunpklo z2.s, z1.h +; VBITS_GE_512-NEXT: uunpklo z3.s, z0.h +; VBITS_GE_512-NEXT: udivr z2.s, p1/m, z2.s, z3.s +; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_512-NEXT: mls z0.h, p0/m, z2.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = urem <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @urem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: urem_v32i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #16 +; VBITS_GE_128-NEXT: mov x9, #24 +; VBITS_GE_128-NEXT: mov x10, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ptrue p1.s, vl4 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: uunpklo z20.s, z1.h +; VBITS_GE_128-NEXT: uunpklo z22.s, z0.h +; VBITS_GE_128-NEXT: uunpkhi z16.s, z2.h +; VBITS_GE_128-NEXT: uunpklo z18.s, z2.h +; VBITS_GE_128-NEXT: uunpkhi z7.s, z4.h +; VBITS_GE_128-NEXT: uunpklo z17.s, z4.h +; VBITS_GE_128-NEXT: udivr z7.s, p1/m, z7.s, z16.s +; VBITS_GE_128-NEXT: movprfx z16, z18 +; VBITS_GE_128-NEXT: udiv z16.s, p1/m, z16.s, z17.s +; VBITS_GE_128-NEXT: ld1h { z17.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: uzp1 z7.h, z16.h, z7.h +; VBITS_GE_128-NEXT: uunpkhi z16.s, z6.h +; VBITS_GE_128-NEXT: uunpkhi z18.s, z1.h +; VBITS_GE_128-NEXT: uunpklo z19.s, z6.h +; VBITS_GE_128-NEXT: udivr z16.s, p1/m, z16.s, z18.s +; VBITS_GE_128-NEXT: movprfx z18, z20 +; VBITS_GE_128-NEXT: udiv z18.s, p1/m, z18.s, z19.s +; VBITS_GE_128-NEXT: uunpkhi z19.s, z5.h +; VBITS_GE_128-NEXT: uunpkhi z20.s, z0.h +; VBITS_GE_128-NEXT: uunpklo z21.s, z5.h +; VBITS_GE_128-NEXT: udivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_128-NEXT: movprfx z20, z22 +; VBITS_GE_128-NEXT: udiv z20.s, p1/m, z20.s, z21.s +; VBITS_GE_128-NEXT: uzp1 z16.h, z18.h, z16.h +; VBITS_GE_128-NEXT: uzp1 z18.h, z20.h, z19.h +; VBITS_GE_128-NEXT: mls z1.h, p0/m, z16.h, z6.h +; VBITS_GE_128-NEXT: mls z0.h, p0/m, z18.h, z5.h +; VBITS_GE_128-NEXT: uunpkhi z5.s, z17.h +; VBITS_GE_128-NEXT: uunpkhi z6.s, z3.h +; VBITS_GE_128-NEXT: udivr z5.s, p1/m, z5.s, z6.s +; VBITS_GE_128-NEXT: uunpklo z16.s, z17.h +; VBITS_GE_128-NEXT: uunpklo z18.s, z3.h +; VBITS_GE_128-NEXT: movprfx z6, z18 +; VBITS_GE_128-NEXT: udiv z6.s, p1/m, z6.s, z16.s +; VBITS_GE_128-NEXT: uzp1 z5.h, z6.h, z5.h +; VBITS_GE_128-NEXT: mls z2.h, p0/m, z7.h, z4.h +; VBITS_GE_128-NEXT: mls z3.h, p0/m, z5.h, z17.h +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; 
VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: uunpkhi z5.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z7.s, z0.h +; VBITS_GE_256-NEXT: uunpkhi z4.s, z2.h +; VBITS_GE_256-NEXT: uunpklo z6.s, z2.h +; VBITS_GE_256-NEXT: uunpkhi z16.s, z3.h +; VBITS_GE_256-NEXT: uunpkhi z17.s, z1.h +; VBITS_GE_256-NEXT: udivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_256-NEXT: uunpklo z5.s, z3.h +; VBITS_GE_256-NEXT: udivr z6.s, p1/m, z6.s, z7.s +; VBITS_GE_256-NEXT: uunpklo z7.s, z1.h +; VBITS_GE_256-NEXT: udivr z16.s, p1/m, z16.s, z17.s +; VBITS_GE_256-NEXT: udivr z5.s, p1/m, z5.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z4.h, z6.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z16.h +; VBITS_GE_256-NEXT: mls z0.h, p0/m, z4.h, z2.h +; VBITS_GE_256-NEXT: mls z1.h, p0/m, z5.h, z3.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = urem <32 x i16> %op1, %op2 + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define void @urem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: urem_v64i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #48 +; VBITS_GE_128-NEXT: mov x9, #56 +; VBITS_GE_128-NEXT: mov x10, #32 +; VBITS_GE_128-NEXT: mov x11, #40 +; VBITS_GE_128-NEXT: mov x12, #16 +; VBITS_GE_128-NEXT: mov x13, #24 +; VBITS_GE_128-NEXT: mov x14, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z7.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z16.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z21.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_128-NEXT: ptrue p1.s, vl4 +; VBITS_GE_128-NEXT: ld1h { z18.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z19.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z20.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_128-NEXT: uunpkhi z22.s, z1.h +; VBITS_GE_128-NEXT: uunpklo z24.s, z1.h +; VBITS_GE_128-NEXT: uunpkhi z17.s, z16.h +; VBITS_GE_128-NEXT: uunpklo z23.s, z16.h +; VBITS_GE_128-NEXT: udivr z17.s, p1/m, z17.s, z22.s +; VBITS_GE_128-NEXT: movprfx z22, z24 +; VBITS_GE_128-NEXT: udiv z22.s, p1/m, z22.s, z23.s +; VBITS_GE_128-NEXT: ld1h { z23.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_128-NEXT: uzp1 z17.h, z22.h, z17.h +; VBITS_GE_128-NEXT: uunpkhi z22.s, z21.h +; VBITS_GE_128-NEXT: uunpkhi z24.s, z2.h +; VBITS_GE_128-NEXT: uunpklo z25.s, z21.h +; VBITS_GE_128-NEXT: uunpklo z26.s, z2.h +; VBITS_GE_128-NEXT: udivr z22.s, p1/m, z22.s, z24.s +; VBITS_GE_128-NEXT: movprfx z24, z26 +; VBITS_GE_128-NEXT: udiv z24.s, p1/m, z24.s, z25.s +; VBITS_GE_128-NEXT: uunpkhi z25.s, z23.h +; VBITS_GE_128-NEXT: uunpkhi z26.s, z4.h +; VBITS_GE_128-NEXT: uzp1 z22.h, z24.h, z22.h +; VBITS_GE_128-NEXT: movprfx z24, z26 +; VBITS_GE_128-NEXT: udiv z24.s, p1/m, z24.s, z25.s +; VBITS_GE_128-NEXT: uunpklo z25.s, z23.h +; VBITS_GE_128-NEXT: uunpklo z26.s, z4.h +; VBITS_GE_128-NEXT: udivr z25.s, p1/m, z25.s, z26.s +; VBITS_GE_128-NEXT: ld1h { z26.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: mls z2.h, p0/m, z22.h, z21.h +; VBITS_GE_128-NEXT: uzp1 z21.h, 
z25.h, z24.h +; VBITS_GE_128-NEXT: uunpkhi z22.s, z20.h +; VBITS_GE_128-NEXT: uunpkhi z24.s, z5.h +; VBITS_GE_128-NEXT: mls z4.h, p0/m, z21.h, z23.h +; VBITS_GE_128-NEXT: movprfx z21, z24 +; VBITS_GE_128-NEXT: udiv z21.s, p1/m, z21.s, z22.s +; VBITS_GE_128-NEXT: uunpklo z22.s, z20.h +; VBITS_GE_128-NEXT: uunpklo z23.s, z5.h +; VBITS_GE_128-NEXT: uunpkhi z24.s, z26.h +; VBITS_GE_128-NEXT: uunpkhi z25.s, z6.h +; VBITS_GE_128-NEXT: udivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_128-NEXT: movprfx z23, z25 +; VBITS_GE_128-NEXT: udiv z23.s, p1/m, z23.s, z24.s +; VBITS_GE_128-NEXT: uunpklo z24.s, z26.h +; VBITS_GE_128-NEXT: uunpklo z25.s, z6.h +; VBITS_GE_128-NEXT: udivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_128-NEXT: uzp1 z21.h, z22.h, z21.h +; VBITS_GE_128-NEXT: uzp1 z22.h, z24.h, z23.h +; VBITS_GE_128-NEXT: mls z5.h, p0/m, z21.h, z20.h +; VBITS_GE_128-NEXT: mls z6.h, p0/m, z22.h, z26.h +; VBITS_GE_128-NEXT: uunpkhi z20.s, z19.h +; VBITS_GE_128-NEXT: uunpkhi z21.s, z7.h +; VBITS_GE_128-NEXT: uunpklo z22.s, z19.h +; VBITS_GE_128-NEXT: uunpklo z23.s, z7.h +; VBITS_GE_128-NEXT: ld1h { z25.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: udivr z20.s, p1/m, z20.s, z21.s +; VBITS_GE_128-NEXT: movprfx z21, z23 +; VBITS_GE_128-NEXT: udiv z21.s, p1/m, z21.s, z22.s +; VBITS_GE_128-NEXT: uunpkhi z22.s, z18.h +; VBITS_GE_128-NEXT: uunpkhi z23.s, z0.h +; VBITS_GE_128-NEXT: udivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_128-NEXT: uunpklo z24.s, z18.h +; VBITS_GE_128-NEXT: uunpklo z26.s, z0.h +; VBITS_GE_128-NEXT: movprfx z23, z26 +; VBITS_GE_128-NEXT: udiv z23.s, p1/m, z23.s, z24.s +; VBITS_GE_128-NEXT: uzp1 z20.h, z21.h, z20.h +; VBITS_GE_128-NEXT: uzp1 z21.h, z23.h, z22.h +; VBITS_GE_128-NEXT: mls z7.h, p0/m, z20.h, z19.h +; VBITS_GE_128-NEXT: mls z0.h, p0/m, z21.h, z18.h +; VBITS_GE_128-NEXT: uunpkhi z18.s, z25.h +; VBITS_GE_128-NEXT: uunpkhi z19.s, z3.h +; VBITS_GE_128-NEXT: stp q0, q7, [x0, #96] +; VBITS_GE_128-NEXT: uunpklo z0.s, z25.h +; VBITS_GE_128-NEXT: uunpklo z7.s, z3.h +; VBITS_GE_128-NEXT: udivr z18.s, p1/m, z18.s, z19.s +; VBITS_GE_128-NEXT: udivr z0.s, p1/m, z0.s, z7.s +; VBITS_GE_128-NEXT: mls z1.h, p0/m, z17.h, z16.h +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z18.h +; VBITS_GE_128-NEXT: stp q4, q2, [x0, #32] +; VBITS_GE_128-NEXT: mls z3.h, p0/m, z0.h, z25.h +; VBITS_GE_128-NEXT: stp q6, q5, [x0, #64] +; VBITS_GE_128-NEXT: stp q3, q1, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: uunpkhi z19.s, z1.h +; VBITS_GE_256-NEXT: uunpkhi z16.s, z2.h +; VBITS_GE_256-NEXT: uunpkhi z7.s, z4.h +; VBITS_GE_256-NEXT: uunpklo z17.s, z4.h +; VBITS_GE_256-NEXT: udivr z7.s, p1/m, z7.s, z16.s +; VBITS_GE_256-NEXT: uunpklo z16.s, z2.h +; VBITS_GE_256-NEXT: uunpkhi z18.s, z5.h +; VBITS_GE_256-NEXT: udiv z16.s, p1/m, z16.s, z17.s +; VBITS_GE_256-NEXT: movprfx z17, z19 +; VBITS_GE_256-NEXT: udiv z17.s, p1/m, z17.s, z18.s +; VBITS_GE_256-NEXT: uunpklo z18.s, z5.h +; VBITS_GE_256-NEXT: 
uunpklo z19.s, z1.h +; VBITS_GE_256-NEXT: uzp1 z7.h, z16.h, z7.h +; VBITS_GE_256-NEXT: udivr z18.s, p1/m, z18.s, z19.s +; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: uzp1 z16.h, z18.h, z17.h +; VBITS_GE_256-NEXT: mls z2.h, p0/m, z7.h, z4.h +; VBITS_GE_256-NEXT: mls z1.h, p0/m, z16.h, z5.h +; VBITS_GE_256-NEXT: uunpkhi z4.s, z6.h +; VBITS_GE_256-NEXT: uunpkhi z5.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z7.s, z6.h +; VBITS_GE_256-NEXT: uunpklo z16.s, z0.h +; VBITS_GE_256-NEXT: udivr z4.s, p1/m, z4.s, z5.s +; VBITS_GE_256-NEXT: movprfx z5, z16 +; VBITS_GE_256-NEXT: udiv z5.s, p1/m, z5.s, z7.s +; VBITS_GE_256-NEXT: uunpkhi z7.s, z19.h +; VBITS_GE_256-NEXT: uunpkhi z16.s, z3.h +; VBITS_GE_256-NEXT: uunpklo z17.s, z19.h +; VBITS_GE_256-NEXT: udivr z7.s, p1/m, z7.s, z16.s +; VBITS_GE_256-NEXT: uunpklo z18.s, z3.h +; VBITS_GE_256-NEXT: movprfx z16, z18 +; VBITS_GE_256-NEXT: udiv z16.s, p1/m, z16.s, z17.s +; VBITS_GE_256-NEXT: uzp1 z4.h, z5.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z5.h, z16.h, z7.h +; VBITS_GE_256-NEXT: mls z0.h, p0/m, z4.h, z6.h +; VBITS_GE_256-NEXT: mls z3.h, p0/m, z5.h, z19.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <64 x i16>, <64 x i16>* %a + %op2 = load <64 x i16>, <64 x i16>* %b + %res = urem <64 x i16> %op1, %op2 + store <64 x i16> %res, <64 x i16>* %a + ret void +} + +define void @urem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { +; VBITS_GE_128-LABEL: urem_v128i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: .cfi_offset b8, -16 +; VBITS_GE_128-NEXT: mov x16, #120 +; VBITS_GE_128-NEXT: mov x17, #112 +; VBITS_GE_128-NEXT: mov x9, #104 +; VBITS_GE_128-NEXT: mov x10, #96 +; VBITS_GE_128-NEXT: mov x11, #88 +; VBITS_GE_128-NEXT: mov x12, #80 +; VBITS_GE_128-NEXT: mov x13, #72 +; VBITS_GE_128-NEXT: mov x8, #64 +; VBITS_GE_128-NEXT: mov x14, #56 +; VBITS_GE_128-NEXT: mov x15, #48 +; VBITS_GE_128-NEXT: mov x18, #40 +; VBITS_GE_128-NEXT: mov x2, #32 +; VBITS_GE_128-NEXT: mov x3, #24 +; VBITS_GE_128-NEXT: mov x4, #16 +; VBITS_GE_128-NEXT: mov x5, #8 +; VBITS_GE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x16, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z1.h }, p0/z, [x0, x17, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z16.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z7.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z6.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z4.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z3.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z23.h }, p0/z, [x0, x15, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z22.h }, p0/z, [x0, x18, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z21.h }, p0/z, [x0, x2, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z20.h }, p0/z, [x0, x3, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z19.h }, p0/z, [x0, x4, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z18.h }, p0/z, [x0, x5, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z17.h }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1h { z26.h }, p0/z, [x1, x5, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z28.h }, p0/z, [x1, x4, lsl #1] +; VBITS_GE_128-NEXT: ptrue p1.s, vl4 +; VBITS_GE_128-NEXT: ld1h { z24.h }, p0/z, [x1, 
x16, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z25.h }, p0/z, [x1, x17, lsl #1] +; VBITS_GE_128-NEXT: ld1h { z27.h }, p0/z, [x1, x3, lsl #1] +; VBITS_GE_128-NEXT: uunpkhi z30.s, z18.h +; VBITS_GE_128-NEXT: uunpklo z8.s, z18.h +; VBITS_GE_128-NEXT: uunpkhi z29.s, z26.h +; VBITS_GE_128-NEXT: uunpklo z31.s, z26.h +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: movprfx z30, z8 +; VBITS_GE_128-NEXT: udiv z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: uunpkhi z31.s, z28.h +; VBITS_GE_128-NEXT: uunpkhi z8.s, z19.h +; VBITS_GE_128-NEXT: uzp1 z29.h, z30.h, z29.h +; VBITS_GE_128-NEXT: movprfx z30, z8 +; VBITS_GE_128-NEXT: udiv z30.s, p1/m, z30.s, z31.s +; VBITS_GE_128-NEXT: uunpklo z31.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z8.s, z19.h +; VBITS_GE_128-NEXT: udivr z31.s, p1/m, z31.s, z8.s +; VBITS_GE_128-NEXT: mls z18.h, p0/m, z29.h, z26.h +; VBITS_GE_128-NEXT: uzp1 z30.h, z31.h, z30.h +; VBITS_GE_128-NEXT: ld1h { z31.h }, p0/z, [x1, x2, lsl #1] +; VBITS_GE_128-NEXT: mls z19.h, p0/m, z30.h, z28.h +; VBITS_GE_128-NEXT: uunpkhi z26.s, z27.h +; VBITS_GE_128-NEXT: uunpkhi z28.s, z20.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z27.h +; VBITS_GE_128-NEXT: uunpklo z30.s, z20.h +; VBITS_GE_128-NEXT: ld1h { z8.h }, p0/z, [x1, x18, lsl #1] +; VBITS_GE_128-NEXT: udivr z26.s, p1/m, z26.s, z28.s +; VBITS_GE_128-NEXT: movprfx z28, z30 +; VBITS_GE_128-NEXT: udiv z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: uunpkhi z29.s, z31.h +; VBITS_GE_128-NEXT: uunpkhi z30.s, z21.h +; VBITS_GE_128-NEXT: uzp1 z26.h, z28.h, z26.h +; VBITS_GE_128-NEXT: movprfx z28, z30 +; VBITS_GE_128-NEXT: udiv z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: uunpklo z29.s, z31.h +; VBITS_GE_128-NEXT: uunpklo z30.s, z21.h +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: mls z20.h, p0/m, z26.h, z27.h +; VBITS_GE_128-NEXT: uzp1 z28.h, z29.h, z28.h +; VBITS_GE_128-NEXT: ld1h { z29.h }, p0/z, [x1, x15, lsl #1] +; VBITS_GE_128-NEXT: mls z21.h, p0/m, z28.h, z31.h +; VBITS_GE_128-NEXT: uunpkhi z26.s, z8.h +; VBITS_GE_128-NEXT: uunpkhi z27.s, z22.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z8.h +; VBITS_GE_128-NEXT: uunpklo z31.s, z22.h +; VBITS_GE_128-NEXT: ld1h { z30.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_128-NEXT: udivr z26.s, p1/m, z26.s, z27.s +; VBITS_GE_128-NEXT: movprfx z27, z31 +; VBITS_GE_128-NEXT: udiv z27.s, p1/m, z27.s, z28.s +; VBITS_GE_128-NEXT: uunpkhi z28.s, z29.h +; VBITS_GE_128-NEXT: uunpkhi z31.s, z23.h +; VBITS_GE_128-NEXT: uzp1 z26.h, z27.h, z26.h +; VBITS_GE_128-NEXT: movprfx z27, z31 +; VBITS_GE_128-NEXT: udiv z27.s, p1/m, z27.s, z28.s +; VBITS_GE_128-NEXT: uunpklo z28.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z31.s, z23.h +; VBITS_GE_128-NEXT: udivr z28.s, p1/m, z28.s, z31.s +; VBITS_GE_128-NEXT: mls z22.h, p0/m, z26.h, z8.h +; VBITS_GE_128-NEXT: uzp1 z27.h, z28.h, z27.h +; VBITS_GE_128-NEXT: ld1h { z28.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_128-NEXT: mls z23.h, p0/m, z27.h, z29.h +; VBITS_GE_128-NEXT: uunpkhi z26.s, z30.h +; VBITS_GE_128-NEXT: uunpkhi z27.s, z3.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z30.h +; VBITS_GE_128-NEXT: uunpklo z8.s, z3.h +; VBITS_GE_128-NEXT: ld1h { z31.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_128-NEXT: udivr z26.s, p1/m, z26.s, z27.s +; VBITS_GE_128-NEXT: movprfx z27, z8 +; VBITS_GE_128-NEXT: udiv z27.s, p1/m, z27.s, z29.s +; VBITS_GE_128-NEXT: uunpkhi z29.s, z28.h +; VBITS_GE_128-NEXT: uunpkhi z8.s, z5.h +; VBITS_GE_128-NEXT: uzp1 z26.h, z27.h, z26.h +; VBITS_GE_128-NEXT: movprfx z27, z8 +; VBITS_GE_128-NEXT: udiv z27.s, p1/m, z27.s, z29.s +; 
VBITS_GE_128-NEXT: uunpklo z29.s, z28.h +; VBITS_GE_128-NEXT: uunpklo z8.s, z5.h +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z8.s +; VBITS_GE_128-NEXT: mls z3.h, p0/m, z26.h, z30.h +; VBITS_GE_128-NEXT: uzp1 z27.h, z29.h, z27.h +; VBITS_GE_128-NEXT: ld1h { z29.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_128-NEXT: mls z5.h, p0/m, z27.h, z28.h +; VBITS_GE_128-NEXT: uunpkhi z26.s, z31.h +; VBITS_GE_128-NEXT: uunpkhi z27.s, z6.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z31.h +; VBITS_GE_128-NEXT: uunpklo z30.s, z6.h +; VBITS_GE_128-NEXT: ld1h { z8.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_128-NEXT: udivr z26.s, p1/m, z26.s, z27.s +; VBITS_GE_128-NEXT: movprfx z27, z30 +; VBITS_GE_128-NEXT: udiv z27.s, p1/m, z27.s, z28.s +; VBITS_GE_128-NEXT: uunpkhi z28.s, z29.h +; VBITS_GE_128-NEXT: uunpkhi z30.s, z7.h +; VBITS_GE_128-NEXT: uzp1 z26.h, z27.h, z26.h +; VBITS_GE_128-NEXT: movprfx z27, z30 +; VBITS_GE_128-NEXT: udiv z27.s, p1/m, z27.s, z28.s +; VBITS_GE_128-NEXT: uunpklo z28.s, z29.h +; VBITS_GE_128-NEXT: uunpklo z30.s, z7.h +; VBITS_GE_128-NEXT: udivr z28.s, p1/m, z28.s, z30.s +; VBITS_GE_128-NEXT: ld1h { z30.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_128-NEXT: mls z6.h, p0/m, z26.h, z31.h +; VBITS_GE_128-NEXT: uzp1 z26.h, z28.h, z27.h +; VBITS_GE_128-NEXT: uunpkhi z27.s, z8.h +; VBITS_GE_128-NEXT: uunpkhi z28.s, z16.h +; VBITS_GE_128-NEXT: mls z7.h, p0/m, z26.h, z29.h +; VBITS_GE_128-NEXT: movprfx z26, z28 +; VBITS_GE_128-NEXT: udiv z26.s, p1/m, z26.s, z27.s +; VBITS_GE_128-NEXT: uunpklo z27.s, z8.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z16.h +; VBITS_GE_128-NEXT: uunpkhi z29.s, z30.h +; VBITS_GE_128-NEXT: uunpkhi z31.s, z0.h +; VBITS_GE_128-NEXT: udivr z27.s, p1/m, z27.s, z28.s +; VBITS_GE_128-NEXT: movprfx z28, z31 +; VBITS_GE_128-NEXT: udiv z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: uunpklo z29.s, z30.h +; VBITS_GE_128-NEXT: uunpklo z31.s, z0.h +; VBITS_GE_128-NEXT: udivr z29.s, p1/m, z29.s, z31.s +; VBITS_GE_128-NEXT: uzp1 z26.h, z27.h, z26.h +; VBITS_GE_128-NEXT: uzp1 z27.h, z29.h, z28.h +; VBITS_GE_128-NEXT: mls z16.h, p0/m, z26.h, z8.h +; VBITS_GE_128-NEXT: mls z0.h, p0/m, z27.h, z30.h +; VBITS_GE_128-NEXT: uunpkhi z26.s, z25.h +; VBITS_GE_128-NEXT: uunpkhi z27.s, z1.h +; VBITS_GE_128-NEXT: uunpklo z28.s, z25.h +; VBITS_GE_128-NEXT: uunpklo z29.s, z1.h +; VBITS_GE_128-NEXT: ld1h { z31.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_128-NEXT: udivr z26.s, p1/m, z26.s, z27.s +; VBITS_GE_128-NEXT: movprfx z27, z29 +; VBITS_GE_128-NEXT: udiv z27.s, p1/m, z27.s, z28.s +; VBITS_GE_128-NEXT: uunpkhi z28.s, z24.h +; VBITS_GE_128-NEXT: uunpkhi z29.s, z2.h +; VBITS_GE_128-NEXT: udivr z28.s, p1/m, z28.s, z29.s +; VBITS_GE_128-NEXT: uunpklo z30.s, z24.h +; VBITS_GE_128-NEXT: uunpklo z8.s, z2.h +; VBITS_GE_128-NEXT: movprfx z29, z8 +; VBITS_GE_128-NEXT: udiv z29.s, p1/m, z29.s, z30.s +; VBITS_GE_128-NEXT: uzp1 z26.h, z27.h, z26.h +; VBITS_GE_128-NEXT: uzp1 z27.h, z29.h, z28.h +; VBITS_GE_128-NEXT: mls z1.h, p0/m, z26.h, z25.h +; VBITS_GE_128-NEXT: mls z2.h, p0/m, z27.h, z24.h +; VBITS_GE_128-NEXT: ld1h { z24.h }, p0/z, [x1] +; VBITS_GE_128-NEXT: uunpkhi z25.s, z31.h +; VBITS_GE_128-NEXT: stp q19, q20, [x0, #32] +; VBITS_GE_128-NEXT: stp q21, q22, [x0, #64] +; VBITS_GE_128-NEXT: stp q23, q3, [x0, #96] +; VBITS_GE_128-NEXT: stp q6, q7, [x0, #160] +; VBITS_GE_128-NEXT: stp q16, q0, [x0, #192] +; VBITS_GE_128-NEXT: uunpklo z0.s, z31.h +; VBITS_GE_128-NEXT: stp q1, q2, [x0, #224] +; VBITS_GE_128-NEXT: uunpkhi z2.s, z4.h +; VBITS_GE_128-NEXT: uunpklo z1.s, z4.h +; VBITS_GE_128-NEXT: udiv z2.s, p1/m, z2.s, z25.s +; 
VBITS_GE_128-NEXT: udivr z0.s, p1/m, z0.s, z1.s +; VBITS_GE_128-NEXT: uunpkhi z1.s, z17.h +; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h +; VBITS_GE_128-NEXT: uunpklo z2.s, z17.h +; VBITS_GE_128-NEXT: mls z4.h, p0/m, z0.h, z31.h +; VBITS_GE_128-NEXT: uunpkhi z0.s, z24.h +; VBITS_GE_128-NEXT: udivr z0.s, p1/m, z0.s, z1.s +; VBITS_GE_128-NEXT: uunpklo z1.s, z24.h +; VBITS_GE_128-NEXT: udivr z1.s, p1/m, z1.s, z2.s +; VBITS_GE_128-NEXT: stp q4, q5, [x0, #128] +; VBITS_GE_128-NEXT: uzp1 z0.h, z1.h, z0.h +; VBITS_GE_128-NEXT: mls z17.h, p0/m, z0.h, z24.h +; VBITS_GE_128-NEXT: stp q17, q18, [x0] +; VBITS_GE_128-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v128i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #96 +; VBITS_GE_256-NEXT: mov x9, #112 +; VBITS_GE_256-NEXT: mov x10, #64 +; VBITS_GE_256-NEXT: mov x11, #80 +; VBITS_GE_256-NEXT: mov x12, #32 +; VBITS_GE_256-NEXT: mov x13, #48 +; VBITS_GE_256-NEXT: mov x14, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: uunpkhi z23.s, z2.h +; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: uunpkhi z22.s, z17.h +; VBITS_GE_256-NEXT: uunpklo z24.s, z17.h +; VBITS_GE_256-NEXT: uunpklo z25.s, z2.h +; VBITS_GE_256-NEXT: udivr z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: movprfx z23, z25 +; VBITS_GE_256-NEXT: udiv z23.s, p1/m, z23.s, z24.s +; VBITS_GE_256-NEXT: uunpkhi z24.s, z21.h +; VBITS_GE_256-NEXT: uunpkhi z25.s, z3.h +; VBITS_GE_256-NEXT: uzp1 z22.h, z23.h, z22.h +; VBITS_GE_256-NEXT: movprfx z23, z25 +; VBITS_GE_256-NEXT: udiv z23.s, p1/m, z23.s, z24.s +; VBITS_GE_256-NEXT: ld1h { z26.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: uunpklo z24.s, z21.h +; VBITS_GE_256-NEXT: uunpklo z25.s, z3.h +; VBITS_GE_256-NEXT: udivr z24.s, p1/m, z24.s, z25.s +; VBITS_GE_256-NEXT: mls z2.h, p0/m, z22.h, z17.h +; VBITS_GE_256-NEXT: uzp1 z17.h, z24.h, z23.h +; VBITS_GE_256-NEXT: uunpkhi z22.s, z20.h +; VBITS_GE_256-NEXT: uunpkhi z23.s, z4.h +; VBITS_GE_256-NEXT: mls z3.h, p0/m, z17.h, z21.h +; VBITS_GE_256-NEXT: movprfx z17, z23 +; VBITS_GE_256-NEXT: udiv z17.s, p1/m, z17.s, z22.s +; VBITS_GE_256-NEXT: uunpklo z21.s, z20.h +; VBITS_GE_256-NEXT: uunpklo z22.s, z4.h +; VBITS_GE_256-NEXT: uunpkhi z23.s, z26.h +; VBITS_GE_256-NEXT: uunpkhi z24.s, z5.h +; VBITS_GE_256-NEXT: udivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_256-NEXT: movprfx z22, z24 +; VBITS_GE_256-NEXT: udiv z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: uunpklo z23.s, z26.h +; VBITS_GE_256-NEXT: uunpklo z24.s, z5.h +; VBITS_GE_256-NEXT: uzp1 z17.h, z21.h, z17.h +; VBITS_GE_256-NEXT: udivr z23.s, p1/m, z23.s, z24.s +; VBITS_GE_256-NEXT: mls z4.h, p0/m, z17.h, z20.h +; 
VBITS_GE_256-NEXT: uzp1 z21.h, z23.h, z22.h +; VBITS_GE_256-NEXT: uunpkhi z17.s, z19.h +; VBITS_GE_256-NEXT: mls z5.h, p0/m, z21.h, z26.h +; VBITS_GE_256-NEXT: uunpkhi z20.s, z6.h +; VBITS_GE_256-NEXT: uunpklo z21.s, z19.h +; VBITS_GE_256-NEXT: uunpklo z22.s, z6.h +; VBITS_GE_256-NEXT: ld1h { z25.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: udivr z17.s, p1/m, z17.s, z20.s +; VBITS_GE_256-NEXT: movprfx z20, z22 +; VBITS_GE_256-NEXT: udiv z20.s, p1/m, z20.s, z21.s +; VBITS_GE_256-NEXT: uunpkhi z21.s, z18.h +; VBITS_GE_256-NEXT: uunpkhi z22.s, z7.h +; VBITS_GE_256-NEXT: uunpklo z23.s, z18.h +; VBITS_GE_256-NEXT: udivr z21.s, p1/m, z21.s, z22.s +; VBITS_GE_256-NEXT: uunpklo z24.s, z7.h +; VBITS_GE_256-NEXT: movprfx z22, z24 +; VBITS_GE_256-NEXT: udiv z22.s, p1/m, z22.s, z23.s +; VBITS_GE_256-NEXT: uzp1 z17.h, z20.h, z17.h +; VBITS_GE_256-NEXT: uzp1 z20.h, z22.h, z21.h +; VBITS_GE_256-NEXT: mls z6.h, p0/m, z17.h, z19.h +; VBITS_GE_256-NEXT: mls z7.h, p0/m, z20.h, z18.h +; VBITS_GE_256-NEXT: uunpkhi z17.s, z16.h +; VBITS_GE_256-NEXT: uunpkhi z18.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z19.s, z16.h +; VBITS_GE_256-NEXT: uunpklo z20.s, z0.h +; VBITS_GE_256-NEXT: udivr z17.s, p1/m, z17.s, z18.s +; VBITS_GE_256-NEXT: movprfx z18, z20 +; VBITS_GE_256-NEXT: udiv z18.s, p1/m, z18.s, z19.s +; VBITS_GE_256-NEXT: uunpkhi z19.s, z25.h +; VBITS_GE_256-NEXT: uunpkhi z20.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z21.s, z25.h +; VBITS_GE_256-NEXT: udivr z19.s, p1/m, z19.s, z20.s +; VBITS_GE_256-NEXT: uunpklo z22.s, z1.h +; VBITS_GE_256-NEXT: movprfx z20, z22 +; VBITS_GE_256-NEXT: udiv z20.s, p1/m, z20.s, z21.s +; VBITS_GE_256-NEXT: uzp1 z17.h, z18.h, z17.h +; VBITS_GE_256-NEXT: uzp1 z18.h, z20.h, z19.h +; VBITS_GE_256-NEXT: mls z0.h, p0/m, z17.h, z16.h +; VBITS_GE_256-NEXT: mls z1.h, p0/m, z18.h, z25.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <128 x i16>, <128 x i16>* %a + %op2 = load <128 x i16>, <128 x i16>* %b + %res = urem <128 x i16> %op1, %op2 + store <128 x i16> %res, <128 x i16>* %a + ret void +} + +define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: urem_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = urem <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: urem_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @urem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: urem_v8i32: 
+; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: movprfx z4, z1 +; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z3.s +; VBITS_GE_128-NEXT: movprfx z5, z0 +; VBITS_GE_128-NEXT: udiv z5.s, p0/m, z5.s, z2.s +; VBITS_GE_128-NEXT: mls z1.s, p0/m, z4.s, z3.s +; VBITS_GE_128-NEXT: mls z0.s, p0/m, z5.s, z2.s +; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: movprfx z2, z0 +; VBITS_GE_256-NEXT: udiv z2.s, p0/m, z2.s, z1.s +; VBITS_GE_256-NEXT: mls z0.s, p0/m, z2.s, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: urem_v8i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: movprfx z2, z0 +; VBITS_GE_512-NEXT: udiv z2.s, p0/m, z2.s, z1.s +; VBITS_GE_512-NEXT: mls z0.s, p0/m, z2.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = urem <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @urem_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: urem_v16i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #8 +; VBITS_GE_128-NEXT: mov x9, #12 +; VBITS_GE_128-NEXT: mov x10, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: movprfx z16, z1 +; VBITS_GE_128-NEXT: udiv z16.s, p0/m, z16.s, z4.s +; VBITS_GE_128-NEXT: mls z1.s, p0/m, z16.s, z4.s +; VBITS_GE_128-NEXT: movprfx z4, z0 +; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: movprfx z16, z2 +; VBITS_GE_128-NEXT: udiv z16.s, p0/m, z16.s, z6.s +; VBITS_GE_128-NEXT: mls z0.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: movprfx z4, z3 +; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z7.s +; VBITS_GE_128-NEXT: mls z3.s, p0/m, z4.s, z7.s +; VBITS_GE_128-NEXT: mls z2.s, p0/m, z16.s, z6.s +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: movprfx z4, z0 +; VBITS_GE_256-NEXT: udiv z4.s, p0/m, z4.s, z2.s +; VBITS_GE_256-NEXT: movprfx z5, z1 +; VBITS_GE_256-NEXT: udiv z5.s, p0/m, z5.s, z3.s +; 
VBITS_GE_256-NEXT: mls z0.s, p0/m, z4.s, z2.s +; VBITS_GE_256-NEXT: mls z1.s, p0/m, z5.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: urem_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: movprfx z2, z0 +; VBITS_GE_512-NEXT: udiv z2.s, p0/m, z2.s, z1.s +; VBITS_GE_512-NEXT: mls z0.s, p0/m, z2.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = urem <16 x i32> %op1, %op2 + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define void @urem_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: urem_v32i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x8, #24 +; VBITS_GE_128-NEXT: mov x9, #28 +; VBITS_GE_128-NEXT: mov x10, #16 +; VBITS_GE_128-NEXT: mov x11, #20 +; VBITS_GE_128-NEXT: mov x12, #8 +; VBITS_GE_128-NEXT: mov x13, #12 +; VBITS_GE_128-NEXT: mov x14, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z18.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z17.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z19.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z21.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z20.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z22.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: movprfx z24, z5 +; VBITS_GE_128-NEXT: udiv z24.s, p0/m, z24.s, z16.s +; VBITS_GE_128-NEXT: mls z5.s, p0/m, z24.s, z16.s +; VBITS_GE_128-NEXT: movprfx z16, z1 +; VBITS_GE_128-NEXT: udiv z16.s, p0/m, z16.s, z18.s +; VBITS_GE_128-NEXT: movprfx z24, z6 +; VBITS_GE_128-NEXT: udiv z24.s, p0/m, z24.s, z20.s +; VBITS_GE_128-NEXT: mls z1.s, p0/m, z16.s, z18.s +; VBITS_GE_128-NEXT: movprfx z16, z4 +; VBITS_GE_128-NEXT: udiv z16.s, p0/m, z16.s, z22.s +; VBITS_GE_128-NEXT: movprfx z18, z0 +; VBITS_GE_128-NEXT: udiv z18.s, p0/m, z18.s, z17.s +; VBITS_GE_128-NEXT: mls z0.s, p0/m, z18.s, z17.s +; VBITS_GE_128-NEXT: movprfx z17, z3 +; VBITS_GE_128-NEXT: udiv z17.s, p0/m, z17.s, z21.s +; VBITS_GE_128-NEXT: movprfx z18, z2 +; VBITS_GE_128-NEXT: udiv z18.s, p0/m, z18.s, z19.s +; VBITS_GE_128-NEXT: mls z2.s, p0/m, z18.s, z19.s +; VBITS_GE_128-NEXT: mls z3.s, p0/m, z17.s, z21.s +; VBITS_GE_128-NEXT: stp q2, q3, [x0, #64] +; VBITS_GE_128-NEXT: mls z4.s, p0/m, z16.s, z22.s +; VBITS_GE_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_GE_128-NEXT: movprfx z0, z7 +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z23.s +; VBITS_GE_128-NEXT: mls z7.s, p0/m, z0.s, z23.s +; VBITS_GE_128-NEXT: mls z6.s, p0/m, z24.s, z20.s +; VBITS_GE_128-NEXT: stp q7, q6, [x0] +; VBITS_GE_128-NEXT: stp q4, q5, [x0, #32] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v32i32: +; VBITS_GE_256: // 
%bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: movprfx z16, z2 +; VBITS_GE_256-NEXT: udiv z16.s, p0/m, z16.s, z5.s +; VBITS_GE_256-NEXT: movprfx z17, z1 +; VBITS_GE_256-NEXT: udiv z17.s, p0/m, z17.s, z4.s +; VBITS_GE_256-NEXT: mls z2.s, p0/m, z16.s, z5.s +; VBITS_GE_256-NEXT: mls z1.s, p0/m, z17.s, z4.s +; VBITS_GE_256-NEXT: movprfx z4, z0 +; VBITS_GE_256-NEXT: udiv z4.s, p0/m, z4.s, z6.s +; VBITS_GE_256-NEXT: movprfx z5, z3 +; VBITS_GE_256-NEXT: udiv z5.s, p0/m, z5.s, z7.s +; VBITS_GE_256-NEXT: mls z0.s, p0/m, z4.s, z6.s +; VBITS_GE_256-NEXT: mls z3.s, p0/m, z5.s, z7.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <32 x i32>, <32 x i32>* %a + %op2 = load <32 x i32>, <32 x i32>* %b + %res = urem <32 x i32> %op1, %op2 + store <32 x i32> %res, <32 x i32>* %a + ret void +} + +define void @urem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { +; VBITS_GE_128-LABEL: urem_v64i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: mov x15, #60 +; VBITS_GE_128-NEXT: mov x16, #56 +; VBITS_GE_128-NEXT: mov x13, #52 +; VBITS_GE_128-NEXT: mov x14, #48 +; VBITS_GE_128-NEXT: mov x17, #44 +; VBITS_GE_128-NEXT: mov x11, #40 +; VBITS_GE_128-NEXT: mov x18, #36 +; VBITS_GE_128-NEXT: mov x12, #32 +; VBITS_GE_128-NEXT: mov x2, #28 +; VBITS_GE_128-NEXT: mov x8, #24 +; VBITS_GE_128-NEXT: mov x3, #20 +; VBITS_GE_128-NEXT: mov x9, #16 +; VBITS_GE_128-NEXT: mov x4, #12 +; VBITS_GE_128-NEXT: mov x10, #8 +; VBITS_GE_128-NEXT: mov x5, #4 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x15, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z1.s }, p0/z, [x0, x16, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z0.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z16.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z7.s }, p0/z, [x0, x17, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z6.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z5.s }, p0/z, [x0, x18, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z3.s }, p0/z, [x0, x2, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z23.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z22.s }, p0/z, [x0, x3, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z21.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z20.s }, p0/z, [x0, x4, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z19.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z18.s }, p0/z, [x0, x5, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z17.s }, p0/z, [x0] +; VBITS_GE_128-NEXT: ld1w { z24.s }, p0/z, [x1, x5, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z25.s }, p0/z, [x1, x4, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z28.s }, p0/z, [x1, x3, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z26.s }, p0/z, [x1, x15, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z27.s }, p0/z, [x1, x16, lsl #2] +; VBITS_GE_128-NEXT: 
ld1w { z29.s }, p0/z, [x1, x2, lsl #2] +; VBITS_GE_128-NEXT: movprfx z30, z18 +; VBITS_GE_128-NEXT: udiv z30.s, p0/m, z30.s, z24.s +; VBITS_GE_128-NEXT: mls z18.s, p0/m, z30.s, z24.s +; VBITS_GE_128-NEXT: movprfx z24, z20 +; VBITS_GE_128-NEXT: udiv z24.s, p0/m, z24.s, z25.s +; VBITS_GE_128-NEXT: ld1w { z30.s }, p0/z, [x1, x18, lsl #2] +; VBITS_GE_128-NEXT: mls z20.s, p0/m, z24.s, z25.s +; VBITS_GE_128-NEXT: ld1w { z24.s }, p0/z, [x1, x17, lsl #2] +; VBITS_GE_128-NEXT: movprfx z25, z22 +; VBITS_GE_128-NEXT: udiv z25.s, p0/m, z25.s, z28.s +; VBITS_GE_128-NEXT: mls z22.s, p0/m, z25.s, z28.s +; VBITS_GE_128-NEXT: movprfx z25, z3 +; VBITS_GE_128-NEXT: udiv z25.s, p0/m, z25.s, z29.s +; VBITS_GE_128-NEXT: ld1w { z28.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_128-NEXT: mls z3.s, p0/m, z25.s, z29.s +; VBITS_GE_128-NEXT: ld1w { z25.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_128-NEXT: movprfx z29, z5 +; VBITS_GE_128-NEXT: udiv z29.s, p0/m, z29.s, z30.s +; VBITS_GE_128-NEXT: mls z5.s, p0/m, z29.s, z30.s +; VBITS_GE_128-NEXT: movprfx z29, z7 +; VBITS_GE_128-NEXT: udiv z29.s, p0/m, z29.s, z24.s +; VBITS_GE_128-NEXT: ld1w { z30.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_128-NEXT: mls z7.s, p0/m, z29.s, z24.s +; VBITS_GE_128-NEXT: ld1w { z24.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_128-NEXT: movprfx z29, z0 +; VBITS_GE_128-NEXT: udiv z29.s, p0/m, z29.s, z28.s +; VBITS_GE_128-NEXT: mls z0.s, p0/m, z29.s, z28.s +; VBITS_GE_128-NEXT: movprfx z28, z1 +; VBITS_GE_128-NEXT: udiv z28.s, p0/m, z28.s, z27.s +; VBITS_GE_128-NEXT: ld1w { z29.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_128-NEXT: mls z1.s, p0/m, z28.s, z27.s +; VBITS_GE_128-NEXT: ld1w { z27.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_128-NEXT: movprfx z28, z2 +; VBITS_GE_128-NEXT: udiv z28.s, p0/m, z28.s, z26.s +; VBITS_GE_128-NEXT: mls z2.s, p0/m, z28.s, z26.s +; VBITS_GE_128-NEXT: ld1w { z26.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_128-NEXT: ld1w { z28.s }, p0/z, [x1] +; VBITS_GE_128-NEXT: stp q1, q2, [x0, #224] +; VBITS_GE_128-NEXT: movprfx z1, z16 +; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z25.s +; VBITS_GE_128-NEXT: mls z16.s, p0/m, z1.s, z25.s +; VBITS_GE_128-NEXT: stp q16, q0, [x0, #192] +; VBITS_GE_128-NEXT: movprfx z0, z6 +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z30.s +; VBITS_GE_128-NEXT: mls z6.s, p0/m, z0.s, z30.s +; VBITS_GE_128-NEXT: stp q6, q7, [x0, #160] +; VBITS_GE_128-NEXT: movprfx z0, z4 +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z24.s +; VBITS_GE_128-NEXT: mls z4.s, p0/m, z0.s, z24.s +; VBITS_GE_128-NEXT: stp q4, q5, [x0, #128] +; VBITS_GE_128-NEXT: movprfx z0, z23 +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z29.s +; VBITS_GE_128-NEXT: mls z23.s, p0/m, z0.s, z29.s +; VBITS_GE_128-NEXT: stp q23, q3, [x0, #96] +; VBITS_GE_128-NEXT: movprfx z0, z21 +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z27.s +; VBITS_GE_128-NEXT: mls z21.s, p0/m, z0.s, z27.s +; VBITS_GE_128-NEXT: stp q21, q22, [x0, #64] +; VBITS_GE_128-NEXT: movprfx z0, z19 +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z26.s +; VBITS_GE_128-NEXT: mls z19.s, p0/m, z0.s, z26.s +; VBITS_GE_128-NEXT: stp q19, q20, [x0, #32] +; VBITS_GE_128-NEXT: movprfx z0, z17 +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z28.s +; VBITS_GE_128-NEXT: mls z17.s, p0/m, z0.s, z28.s +; VBITS_GE_128-NEXT: stp q17, q18, [x0] +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: urem_v64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #56 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov 
x13, #24 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: movprfx z23, z6 +; VBITS_GE_256-NEXT: udiv z23.s, p0/m, z23.s, z16.s +; VBITS_GE_256-NEXT: mls z6.s, p0/m, z23.s, z16.s +; VBITS_GE_256-NEXT: movprfx z16, z5 +; VBITS_GE_256-NEXT: udiv z16.s, p0/m, z16.s, z17.s +; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mls z5.s, p0/m, z16.s, z17.s +; VBITS_GE_256-NEXT: movprfx z16, z4 +; VBITS_GE_256-NEXT: udiv z16.s, p0/m, z16.s, z21.s +; VBITS_GE_256-NEXT: movprfx z17, z3 +; VBITS_GE_256-NEXT: udiv z17.s, p0/m, z17.s, z20.s +; VBITS_GE_256-NEXT: mls z4.s, p0/m, z16.s, z21.s +; VBITS_GE_256-NEXT: mls z3.s, p0/m, z17.s, z20.s +; VBITS_GE_256-NEXT: movprfx z16, z2 +; VBITS_GE_256-NEXT: udiv z16.s, p0/m, z16.s, z22.s +; VBITS_GE_256-NEXT: movprfx z17, z1 +; VBITS_GE_256-NEXT: udiv z17.s, p0/m, z17.s, z19.s +; VBITS_GE_256-NEXT: mls z2.s, p0/m, z16.s, z22.s +; VBITS_GE_256-NEXT: mls z1.s, p0/m, z17.s, z19.s +; VBITS_GE_256-NEXT: movprfx z16, z0 +; VBITS_GE_256-NEXT: udiv z16.s, p0/m, z16.s, z18.s +; VBITS_GE_256-NEXT: movprfx z17, z7 +; VBITS_GE_256-NEXT: udiv z17.s, p0/m, z17.s, z23.s +; VBITS_GE_256-NEXT: mls z0.s, p0/m, z16.s, z18.s +; VBITS_GE_256-NEXT: mls z7.s, p0/m, z17.s, z23.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret + %op1 = load <64 x i32>, <64 x i32>* %a + %op2 = load <64 x i32>, <64 x i32>* %b + %res = urem <64 x i32> %op1, %op2 + store <64 x i32> %res, <64 x i32>* %a + ret void +} + +define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: urem_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = urem <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: urem_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 
killed $q1 def $z1
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %res = urem <2 x i64> %op1, %op2
+ ret <2 x i64> %res
+}
+
+define void @urem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: urem_v4i64:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x8, #2
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT: movprfx z4, z1
+; VBITS_GE_128-NEXT: udiv z4.d, p0/m, z4.d, z3.d
+; VBITS_GE_128-NEXT: movprfx z5, z0
+; VBITS_GE_128-NEXT: udiv z5.d, p0/m, z5.d, z2.d
+; VBITS_GE_128-NEXT: mls z1.d, p0/m, z4.d, z3.d
+; VBITS_GE_128-NEXT: mls z0.d, p0/m, z5.d, z2.d
+; VBITS_GE_128-NEXT: stp q1, q0, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: urem_v4i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: movprfx z2, z0
+; VBITS_GE_256-NEXT: udiv z2.d, p0/m, z2.d, z1.d
+; VBITS_GE_256-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: urem_v4i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl4
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: movprfx z2, z0
+; VBITS_GE_512-NEXT: udiv z2.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+ %op1 = load <4 x i64>, <4 x i64>* %a
+ %op2 = load <4 x i64>, <4 x i64>* %b
+ %res = urem <4 x i64> %op1, %op2
+ store <4 x i64> %res, <4 x i64>* %a
+ ret void
+}
+
+define void @urem_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: urem_v8i64:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x8, #4
+; VBITS_GE_128-NEXT: mov x9, #6
+; VBITS_GE_128-NEXT: mov x10, #2
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT: movprfx z16, z1
+; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z4.d
+; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z4.d
+; VBITS_GE_128-NEXT: movprfx z4, z0
+; VBITS_GE_128-NEXT: udiv z4.d, p0/m, z4.d, z5.d
+; VBITS_GE_128-NEXT: movprfx z16, z2
+; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z6.d
+; VBITS_GE_128-NEXT: mls z0.d, p0/m, z4.d, z5.d
+; VBITS_GE_128-NEXT: movprfx z4, z3
+; VBITS_GE_128-NEXT: udiv z4.d, p0/m, z4.d, z7.d
+; VBITS_GE_128-NEXT: mls z3.d, p0/m, z4.d, z7.d
+; VBITS_GE_128-NEXT: mls z2.d, p0/m, z16.d, z6.d
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: stp q3, q2, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: urem_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: movprfx z4, z0
+; VBITS_GE_256-NEXT: udiv z4.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT: movprfx z5, z1
+; VBITS_GE_256-NEXT: udiv z5.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT: mls z0.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: urem_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: movprfx z2, z0
+; VBITS_GE_512-NEXT: udiv z2.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
+ %op1 = load <8 x i64>, <8 x i64>* %a
+ %op2 = load <8 x i64>, <8 x i64>* %b
+ %res = urem <8 x i64> %op1, %op2
+ store <8 x i64> %res, <8 x i64>* %a
+ ret void
+}
+
+define void @urem_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: urem_v16i64:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x8, #12
+; VBITS_GE_128-NEXT: mov x9, #14
+; VBITS_GE_128-NEXT: mov x10, #8
+; VBITS_GE_128-NEXT: mov x11, #10
+; VBITS_GE_128-NEXT: mov x12, #4
+; VBITS_GE_128-NEXT: mov x13, #6
+; VBITS_GE_128-NEXT: mov x14, #2
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z18.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z17.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z19.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z21.d }, p0/z, [x1, x11, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z20.d }, p0/z, [x1, x14, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z22.d }, p0/z, [x1, x12, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z23.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT: movprfx z24, z5
+; VBITS_GE_128-NEXT: udiv z24.d, p0/m, z24.d, z16.d
+; VBITS_GE_128-NEXT: mls z5.d, p0/m, z24.d, z16.d
+; VBITS_GE_128-NEXT: movprfx z16, z1
+; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z18.d
+; VBITS_GE_128-NEXT: movprfx z24, z6
+; VBITS_GE_128-NEXT: udiv z24.d, p0/m, z24.d, z20.d
+; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z18.d
+; VBITS_GE_128-NEXT: movprfx z16, z4
+; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z22.d
+; VBITS_GE_128-NEXT: movprfx z18, z0
+; VBITS_GE_128-NEXT: udiv z18.d, p0/m, z18.d, z17.d
+; VBITS_GE_128-NEXT: mls z0.d, p0/m, z18.d, z17.d
+; VBITS_GE_128-NEXT: movprfx z17, z3
+; VBITS_GE_128-NEXT: udiv z17.d, p0/m, z17.d, z21.d
+; VBITS_GE_128-NEXT: movprfx z18, z2
+; VBITS_GE_128-NEXT: udiv z18.d, p0/m, z18.d, z19.d
+; VBITS_GE_128-NEXT: mls z2.d, p0/m, z18.d, z19.d
+; VBITS_GE_128-NEXT: mls z3.d, p0/m, z17.d, z21.d
+; VBITS_GE_128-NEXT: stp q2, q3, [x0, #64]
+; VBITS_GE_128-NEXT: mls z4.d, p0/m, z16.d, z22.d
+; VBITS_GE_128-NEXT: stp q0, q1, [x0, #96]
+; VBITS_GE_128-NEXT: movprfx z0, z7
+; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z23.d
+; VBITS_GE_128-NEXT: mls z7.d, p0/m, z0.d, z23.d
+; VBITS_GE_128-NEXT: mls z6.d, p0/m, z24.d, z20.d
+; VBITS_GE_128-NEXT: stp q7, q6, [x0]
+; VBITS_GE_128-NEXT: stp q4, q5, [x0, #32]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: urem_v16i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x9, #12
+; VBITS_GE_256-NEXT: mov x10, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: movprfx z16, z2
+; VBITS_GE_256-NEXT: udiv z16.d, p0/m, z16.d, z5.d
+; VBITS_GE_256-NEXT: movprfx z17, z1
+; VBITS_GE_256-NEXT: udiv z17.d, p0/m, z17.d, z4.d
+; VBITS_GE_256-NEXT: mls z2.d, p0/m, z16.d, z5.d
+; VBITS_GE_256-NEXT: mls z1.d, p0/m, z17.d, z4.d
+; VBITS_GE_256-NEXT: movprfx z4, z0
+; VBITS_GE_256-NEXT: udiv z4.d, p0/m, z4.d, z6.d
+; VBITS_GE_256-NEXT: movprfx z5, z3
+; VBITS_GE_256-NEXT: udiv z5.d, p0/m, z5.d, z7.d
+; VBITS_GE_256-NEXT: mls z0.d, p0/m, z4.d, z6.d
+; VBITS_GE_256-NEXT: mls z3.d, p0/m, z5.d, z7.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+ %op1 = load <16 x i64>, <16 x i64>* %a
+ %op2 = load <16 x i64>, <16 x i64>* %b
+ %res = urem <16 x i64> %op1, %op2
+ store <16 x i64> %res, <16 x i64>* %a
+ ret void
+}
+
+define void @urem_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+; VBITS_GE_128-LABEL: urem_v32i64:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: mov x15, #30
+; VBITS_GE_128-NEXT: mov x16, #28
+; VBITS_GE_128-NEXT: mov x13, #26
+; VBITS_GE_128-NEXT: mov x14, #24
+; VBITS_GE_128-NEXT: mov x17, #22
+; VBITS_GE_128-NEXT: mov x11, #20
+; VBITS_GE_128-NEXT: mov x18, #18
+; VBITS_GE_128-NEXT: mov x12, #16
+; VBITS_GE_128-NEXT: mov x2, #14
+; VBITS_GE_128-NEXT: mov x8, #12
+; VBITS_GE_128-NEXT: mov x3, #10
+; VBITS_GE_128-NEXT: mov x9, #8
+; VBITS_GE_128-NEXT: mov x4, #6
+; VBITS_GE_128-NEXT: mov x10, #4
+; VBITS_GE_128-NEXT: mov x5, #2
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ld1d { z2.d }, p0/z, [x0, x15, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z1.d }, p0/z, [x0, x16, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z0.d }, p0/z, [x0, x13, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z16.d }, p0/z, [x0, x14, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z7.d }, p0/z, [x0, x17, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z6.d }, p0/z, [x0, x11, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z5.d }, p0/z, [x0, x18, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z3.d }, p0/z, [x0, x2, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z23.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z22.d }, p0/z, [x0, x3, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z21.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z20.d }, p0/z, [x0, x4, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z19.d }, p0/z, [x0, x10, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z18.d }, p0/z, [x0, x5, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z17.d }, p0/z, [x0]
+; VBITS_GE_128-NEXT: ld1d { z24.d }, p0/z, [x1, x5, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z25.d }, p0/z, [x1, x4, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z28.d }, p0/z, [x1, x3, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z26.d }, p0/z, [x1, x15, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z27.d }, p0/z, [x1, x16, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z29.d }, p0/z, [x1, x2, lsl #3]
+; VBITS_GE_128-NEXT: movprfx z30, z18
+; VBITS_GE_128-NEXT: udiv z30.d, p0/m, z30.d, z24.d
+; VBITS_GE_128-NEXT: mls z18.d, p0/m, z30.d, z24.d
+; VBITS_GE_128-NEXT: movprfx z24, z20
+; VBITS_GE_128-NEXT: udiv z24.d, p0/m, z24.d, z25.d
+; VBITS_GE_128-NEXT: ld1d { z30.d }, p0/z, [x1, x18, lsl #3]
+; VBITS_GE_128-NEXT: mls z20.d, p0/m, z24.d, z25.d
+; VBITS_GE_128-NEXT: ld1d { z24.d }, p0/z, [x1, x17, lsl #3]
+; VBITS_GE_128-NEXT: movprfx z25, z22
+; VBITS_GE_128-NEXT: udiv z25.d, p0/m, z25.d, z28.d
+; VBITS_GE_128-NEXT: mls z22.d, p0/m, z25.d, z28.d
+; VBITS_GE_128-NEXT: movprfx z25, z3
+; VBITS_GE_128-NEXT: udiv z25.d, p0/m, z25.d, z29.d
+; VBITS_GE_128-NEXT: ld1d { z28.d }, p0/z, [x1, x13, lsl #3]
+; VBITS_GE_128-NEXT: mls z3.d, p0/m, z25.d, z29.d
+; VBITS_GE_128-NEXT: ld1d { z25.d }, p0/z, [x1, x14, lsl #3]
+; VBITS_GE_128-NEXT: movprfx z29, z5
+; VBITS_GE_128-NEXT: udiv z29.d, p0/m, z29.d, z30.d
+; VBITS_GE_128-NEXT: mls z5.d, p0/m, z29.d, z30.d
+; VBITS_GE_128-NEXT: movprfx z29, z7
+; VBITS_GE_128-NEXT: udiv z29.d, p0/m, z29.d, z24.d
+; VBITS_GE_128-NEXT: ld1d { z30.d }, p0/z, [x1, x11, lsl #3]
+; VBITS_GE_128-NEXT: mls z7.d, p0/m, z29.d, z24.d
+; VBITS_GE_128-NEXT: ld1d { z24.d }, p0/z, [x1, x12, lsl #3]
+; VBITS_GE_128-NEXT: movprfx z29, z0
+; VBITS_GE_128-NEXT: udiv z29.d, p0/m, z29.d, z28.d
+; VBITS_GE_128-NEXT: mls z0.d, p0/m, z29.d, z28.d
+; VBITS_GE_128-NEXT: movprfx z28, z1
+; VBITS_GE_128-NEXT: udiv z28.d, p0/m, z28.d, z27.d
+; VBITS_GE_128-NEXT: ld1d { z29.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_128-NEXT: mls z1.d, p0/m, z28.d, z27.d
+; VBITS_GE_128-NEXT: ld1d { z27.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_128-NEXT: movprfx z28, z2
+; VBITS_GE_128-NEXT: udiv z28.d, p0/m, z28.d, z26.d
+; VBITS_GE_128-NEXT: mls z2.d, p0/m, z28.d, z26.d
+; VBITS_GE_128-NEXT: ld1d { z26.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_128-NEXT: ld1d { z28.d }, p0/z, [x1]
+; VBITS_GE_128-NEXT: stp q1, q2, [x0, #224]
+; VBITS_GE_128-NEXT: movprfx z1, z16
+; VBITS_GE_128-NEXT: udiv z1.d, p0/m, z1.d, z25.d
+; VBITS_GE_128-NEXT: mls z16.d, p0/m, z1.d, z25.d
+; VBITS_GE_128-NEXT: stp q16, q0, [x0, #192]
+; VBITS_GE_128-NEXT: movprfx z0, z6
+; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z30.d
+; VBITS_GE_128-NEXT: mls z6.d, p0/m, z0.d, z30.d
+; VBITS_GE_128-NEXT: stp q6, q7, [x0, #160]
+; VBITS_GE_128-NEXT: movprfx z0, z4
+; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z24.d
+; VBITS_GE_128-NEXT: mls z4.d, p0/m, z0.d, z24.d
+; VBITS_GE_128-NEXT: stp q4, q5, [x0, #128]
+; VBITS_GE_128-NEXT: movprfx z0, z23
+; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z29.d
+; VBITS_GE_128-NEXT: mls z23.d, p0/m, z0.d, z29.d
+; VBITS_GE_128-NEXT: stp q23, q3, [x0, #96]
+; VBITS_GE_128-NEXT: movprfx z0, z21
+; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z27.d
+; VBITS_GE_128-NEXT: mls z21.d, p0/m, z0.d, z27.d
+; VBITS_GE_128-NEXT: stp q21, q22, [x0, #64]
+; VBITS_GE_128-NEXT: movprfx z0, z19
+; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z26.d
+; VBITS_GE_128-NEXT: mls z19.d, p0/m, z0.d, z26.d
+; VBITS_GE_128-NEXT: stp q19, q20, [x0, #32]
+; VBITS_GE_128-NEXT: movprfx z0, z17
+; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z28.d
+; VBITS_GE_128-NEXT: mls z17.d, p0/m, z0.d, z28.d
+; VBITS_GE_128-NEXT: stp q17, q18, [x0]
+; VBITS_GE_128-NEXT: ret
+;
+; VBITS_GE_256-LABEL: urem_v32i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #24
+; VBITS_GE_256-NEXT: mov x9, #28
+; VBITS_GE_256-NEXT: mov x10, #16
+; VBITS_GE_256-NEXT: mov x11, #20
+; VBITS_GE_256-NEXT: mov x12, #8
+; VBITS_GE_256-NEXT: mov x13, #12
+; VBITS_GE_256-NEXT: mov x14, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x14, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x13, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x12, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_256-NEXT: movprfx z23, z6
+; VBITS_GE_256-NEXT: udiv z23.d, p0/m, z23.d, z16.d
+; VBITS_GE_256-NEXT: mls z6.d, p0/m, z23.d, z16.d
+; VBITS_GE_256-NEXT: movprfx z16, z5
+; VBITS_GE_256-NEXT: udiv z16.d, p0/m, z16.d, z17.d
+; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mls z5.d, p0/m, z16.d, z17.d
+; VBITS_GE_256-NEXT: movprfx z16, z4
+; VBITS_GE_256-NEXT: udiv z16.d, p0/m, z16.d, z21.d
+; VBITS_GE_256-NEXT: movprfx z17, z3
+; VBITS_GE_256-NEXT: udiv z17.d, p0/m, z17.d, z20.d
+; VBITS_GE_256-NEXT: mls z4.d, p0/m, z16.d, z21.d
+; VBITS_GE_256-NEXT: mls z3.d, p0/m, z17.d, z20.d
+; VBITS_GE_256-NEXT: movprfx z16, z2
+; VBITS_GE_256-NEXT: udiv z16.d, p0/m, z16.d, z22.d
+; VBITS_GE_256-NEXT: movprfx z17, z1
+; VBITS_GE_256-NEXT: udiv z17.d, p0/m, z17.d, z19.d
+; VBITS_GE_256-NEXT: mls z2.d, p0/m, z16.d, z22.d
+; VBITS_GE_256-NEXT: mls z1.d, p0/m, z17.d, z19.d
+; VBITS_GE_256-NEXT: movprfx z16, z0
+; VBITS_GE_256-NEXT: udiv z16.d, p0/m, z16.d, z18.d
+; VBITS_GE_256-NEXT: movprfx z17, z7
+; VBITS_GE_256-NEXT: udiv z17.d, p0/m, z17.d, z23.d
+; VBITS_GE_256-NEXT: mls z0.d, p0/m, z16.d, z18.d
+; VBITS_GE_256-NEXT: mls z7.d, p0/m, z17.d, z23.d
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+ %op1 = load <32 x i64>, <32 x i64>* %a
+ %op2 = load <32 x i64>, <32 x i64>* %b
+ %res = urem <32 x i64> %op1, %op2
+ store <32 x i64> %res, <32 x i64>* %a
+ ret void
+}
+
+attributes #0 = { "target-features"="+sve" }