diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1391,7 +1391,7 @@ if (Subtarget->forceStreamingCompatibleSVE()) { for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, - MVT::v4i32, MVT::v2i64}) + MVT::v4i32, MVT::v1i64, MVT::v2i64}) addTypeForStreamingSVE(VT); for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64}) @@ -1610,8 +1610,15 @@ setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::SIGN_EXTEND, VT, Custom); } - setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::AND, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::SUB, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); + setOperationAction(ISD::MULHS, VT, Custom); + setOperationAction(ISD::MULHU, VT, Custom); + setOperationAction(ISD::ABS, VT, Custom); + setOperationAction(ISD::XOR, VT, Custom); } void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { @@ -3536,7 +3543,8 @@ } SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { - if (useSVEForFixedLengthVectorVT(Op.getValueType())) + if (useSVEForFixedLengthVectorVT(Op.getValueType(), + Subtarget->forceStreamingCompatibleSVE())) return LowerToScalableOp(Op, DAG); SDValue Sel = Op.getOperand(0); @@ -4448,7 +4456,8 @@ EVT VT = Op.getValueType(); // If SVE is available then i64 vector multiplications can also be made legal. 
- bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64; + bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64 || + Subtarget->forceStreamingCompatibleSVE(); if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON)) return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED); @@ -11369,13 +11378,13 @@ const APInt &Bits, const SDValue *LHS = nullptr, const AArch64Subtarget *const Subtarget = nullptr) { + EVT VT = Op.getValueType(); if(Subtarget && VT.isFixedLengthVector() && Subtarget->forceStreamingCompatibleSVE()) return SDValue(); if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); - EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; bool isAdvSIMDModImm = false; uint64_t Shift; @@ -11428,7 +11437,6 @@ if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); - EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; bool isAdvSIMDModImm = false; uint64_t Shift; @@ -11655,7 +11663,8 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { - if (useSVEForFixedLengthVectorVT(Op.getValueType())) + if (useSVEForFixedLengthVectorVT(Op.getValueType(), + Subtarget->forceStreamingCompatibleSVE())) return LowerToScalableOp(Op, DAG); // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) @@ -15743,7 +15752,6 @@ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - if (VT.isScalableVector()) return performSVEAndCombine(N, DCI); @@ -22393,7 +22401,8 @@ } // "cast" fixed length vector to a scalable vector. 
- assert(useSVEForFixedLengthVectorVT(V.getValueType(), Subtarget->forceStreamingCompatibleSVE()) && + assert(useSVEForFixedLengthVectorVT( + V.getValueType(), Subtarget->forceStreamingCompatibleSVE()) && "Only fixed length vectors are supported!"); Ops.push_back(convertToScalableVector(DAG, ContainerVT, V)); } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll @@ -0,0 +1,791 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; ADD +; +define <4 x i8> @add_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: add_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: add_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: add_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: add z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = add <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 
{ +; CHECK-LABEL: add_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: add z0.b, z0.b, z2.b +; CHECK-NEXT: add z1.b, z1.b, z3.b +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = add <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <2 x i16> @add_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: add_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: add_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: add_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = add <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: add_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: add z1.h, z1.h, z3.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = add <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x 
i16>* %a + ret void +} + +define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: add_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: add_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = add <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: add_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: add z1.s, z1.s, z3.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = add <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: add_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = add <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: add_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = add <2 x i64> %op1, %op2 + ret 
<2 x i64> %res +} + +define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: add_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEXT: add z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = add <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; MUL +; + +define <4 x i8> @mul_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: mul_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: mul_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: mul_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = mul <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @mul_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: mul_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; 
CHECK-NEXT: mul z1.b, p0/m, z1.b, z3.b +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = mul <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: mul_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: mul_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: mul_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = mul <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @mul_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: mul_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = mul <16 x i16> %op1, %op2 + store <16 x i16> 
%res, <16 x i16>* %a + ret void +} + +define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: mul_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: mul_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = mul <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @mul_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: mul_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = mul <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: mul_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = mul <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: mul_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; 
CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = mul <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @mul_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: mul_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = mul <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; SUB +; + +define <4 x i8> @sub_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: sub_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: sub_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: sub_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sub z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sub <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @sub_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: sub_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: 
ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: sub z0.b, z0.b, z2.b +; CHECK-NEXT: sub z1.b, z1.b, z3.b +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = sub <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <2 x i16> @sub_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: sub_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: sub_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: sub_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sub <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @sub_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: sub_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: sub z0.h, z0.h, z2.h +; CHECK-NEXT: sub z1.h, z1.h, z3.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = sub <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @sub_v2i32(<2 x i32> 
%op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: sub_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: sub_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sub <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @sub_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: sub_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: sub z0.s, z0.s, z2.s +; CHECK-NEXT: sub z1.s, z1.s, z3.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = sub <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: sub_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sub z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sub <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: sub_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sub z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sub <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @sub_v4i64(<4 x i64>* %a, <4 x 
i64>* %b) #0 { +; CHECK-LABEL: sub_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: sub z0.d, z0.d, z2.d +; CHECK-NEXT: sub z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = sub <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; ABS +; + +define <4 x i8> @abs_v4i8(<4 x i8> %op1) #0 { +; CHECK-LABEL: abs_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI42_0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI42_0] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %op1, i1 false) + ret <4 x i8> %res +} + +define <8 x i8> @abs_v8i8(<8 x i8> %op1) #0 { +; CHECK-LABEL: abs_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: abs z0.b, p0/m, z0.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false) + ret <8 x i8> %res +} + +define <16 x i8> @abs_v16i8(<16 x i8> %op1) #0 { +; CHECK-LABEL: abs_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: abs z0.b, p0/m, z0.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false) + ret <16 x i8> %res +} + +define void @abs_v32i8(<32 x i8>* %a) #0 { +; CHECK-LABEL: abs_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: abs z0.b, p0/m, z0.b +; CHECK-NEXT: abs z1.b, p0/m, z1.b +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + 
%op1 = load <32 x i8>, <32 x i8>* %a + %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false) + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <2 x i16> @abs_v2i16(<2 x i16> %op1) #0 { +; CHECK-LABEL: abs_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI46_0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI46_0] +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %op1, i1 false) + ret <2 x i16> %res +} + +define <4 x i16> @abs_v4i16(<4 x i16> %op1) #0 { +; CHECK-LABEL: abs_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false) + ret <4 x i16> %res +} + +define <8 x i16> @abs_v8i16(<8 x i16> %op1) #0 { +; CHECK-LABEL: abs_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false) + ret <8 x i16> %res +} + +define void @abs_v16i16(<16 x i16>* %a) #0 { +; CHECK-LABEL: abs_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: abs z1.h, p0/m, z1.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false) + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @abs_v2i32(<2 x i32> %op1) #0 { +; CHECK-LABEL: abs_v2i32: +; CHECK: // 
%bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false) + ret <2 x i32> %res +} + +define <4 x i32> @abs_v4i32(<4 x i32> %op1) #0 { +; CHECK-LABEL: abs_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false) + ret <4 x i32> %res +} + +define void @abs_v8i32(<8 x i32>* %a) #0 { +; CHECK-LABEL: abs_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: abs z1.s, p0/m, z1.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @abs_v1i64(<1 x i64> %op1) #0 { +; CHECK-LABEL: abs_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false) + ret <1 x i64> %res +} + +define <2 x i64> @abs_v2i64(<2 x i64> %op1) #0 { +; CHECK-LABEL: abs_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false) + ret <2 x i64> %res +} + +define void @abs_v4i64(<4 x i64>* %a) #0 { +; CHECK-LABEL: abs_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; 
CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: abs z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +declare <4 x i8> @llvm.abs.v4i8(<4 x i8>, i1) +declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1) +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) +declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) +declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1) +declare <2 x i16> @llvm.abs.v2i16(<2 x i16>, i1) +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) +declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1) +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) +declare <1 x i64> @llvm.abs.v1i64(<1 x i64>, i1) +declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1) +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) + + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -0,0 +1,737 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; SDIV +; + +define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: sdiv_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: lsl 
z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: asr z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = sdiv <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: sdiv_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: mov z1.h, z0.h[7] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: mov z3.h, z0.h[5] +; CHECK-NEXT: mov z4.h, z0.h[4] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: mov z6.h, z0.h[2] +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: strb w10, [sp, #14] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w8, [sp, #13] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s0 +; 
CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = sdiv <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: sdiv_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: sunpkhi z2.h, z1.b +; CHECK-NEXT: sunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sunpkhi z4.s, z2.h +; CHECK-NEXT: sunpkhi z5.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sunpkhi z3.s, z1.h +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: sdiv_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q3, q0, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: sunpkhi z4.h, z0.b +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpkhi z6.s, z4.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpkhi z16.s, z0.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpkhi z5.h, z1.b +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sunpkhi z7.s, z5.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.s, z1.h +; CHECK-NEXT: 
sunpklo z1.s, z1.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z6.h +; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: sunpkhi z1.h, z3.b +; CHECK-NEXT: sunpkhi z6.h, z2.b +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z16.s +; CHECK-NEXT: sunpkhi z7.s, z1.h +; CHECK-NEXT: sunpkhi z16.s, z6.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z6.s +; CHECK-NEXT: sunpkhi z6.s, z3.h +; CHECK-NEXT: sunpkhi z16.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z16.s +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z7.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h +; CHECK-NEXT: uzp1 z1.b, z2.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z4.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = sdiv <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: sdiv_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: lsl z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: asr z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: sdiv_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; 
CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = sdiv <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: sdiv_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: sdiv_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z6.s, z0.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: sunpkhi z4.s, z1.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpkhi z5.s, z2.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.s, z3.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z3.s +; CHECK-NEXT: 
sdivr z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z4.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = sdiv <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: sdiv_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: sdiv_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @sdiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: sdiv_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = sdiv <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: sdiv_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: 
def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: sdiv_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = sdiv <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @sdiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: sdiv_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: sdiv z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = sdiv <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; UDIV +; + +define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: udiv_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: ldr d0, 
[sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = udiv <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: udiv_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: mov z1.h, z0.h[7] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: mov z3.h, z0.h[5] +; CHECK-NEXT: mov z4.h, z0.h[4] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: mov z6.h, z0.h[2] +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: strb w10, [sp, #14] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w8, [sp, #13] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = udiv <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: udiv_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: uunpkhi z2.h, z1.b +; CHECK-NEXT: uunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpkhi z4.s, 
z2.h +; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uunpkhi z3.s, z1.h +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: udiv_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q3, q0, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: uunpkhi z4.h, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z6.s, z4.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpkhi z16.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpkhi z5.h, z1.b +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpkhi z7.s, z5.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z6.h +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uunpkhi z1.h, z3.b +; CHECK-NEXT: uunpkhi z6.h, z2.b +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z16.s +; CHECK-NEXT: uunpkhi z7.s, z1.h +; CHECK-NEXT: uunpkhi z16.s, z6.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z6.s +; CHECK-NEXT: uunpkhi z6.s, z3.h +; CHECK-NEXT: uunpkhi z16.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; 
CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z16.s +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z7.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h +; CHECK-NEXT: uzp1 z1.b, z2.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z4.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = udiv <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: udiv_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI18_0] +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <2 x i16> %op1, %op2 + ret <2 x i16> %res +} + +define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: udiv_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = udiv <4 x i16> %op1, %op2 + ret <4 x 
i16> %res +} + +define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: udiv_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: udiv_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z6.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: uunpkhi z4.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpkhi z5.s, z2.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z3.s +; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z4.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = udiv <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: udiv_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <2 x i32> 
%op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: udiv_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: udiv_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = udiv <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: udiv_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: udiv_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = udiv <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @udiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: udiv_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 +; 
CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: udiv z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = udiv <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @udiv_constantsplat_v8i32(<8 x i32>* %a) #0 { +; CHECK-LABEL: udiv_constantsplat_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI28_0 +; CHECK-NEXT: adrp x9, .LCPI28_1 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI28_0] +; CHECK-NEXT: adrp x8, .LCPI28_2 +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI28_1] +; CHECK-NEXT: movprfx z5, z1 +; CHECK-NEXT: umulh z5.s, p0/m, z5.s, z0.s +; CHECK-NEXT: sub z1.s, z1.s, z5.s +; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI28_2] +; CHECK-NEXT: sub z2.s, z2.s, z0.s +; CHECK-NEXT: lsr z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: add z1.s, z1.s, z5.s +; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: lsr z1.s, p0/m, z1.s, z4.s +; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z4.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %res = udiv <8 x i32> %op1, + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll @@ -0,0 +1,498 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; AND +; + +define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: and_v8i8: +; CHECK: // %bb.0: 
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: and_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @and_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: and_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = and <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: and_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: and_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @and_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: and_v16i16: +; CHECK: // %bb.0: +; 
CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = and <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: and_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: and_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @and_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: and_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = and <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: and_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = and <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @and_v2i64(<2 
x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: and_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = and <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @and_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: and_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = and <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; OR +; + +define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: or_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: or_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @or_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: or_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = or <32 x i8> %op1, %op2 + store <32 x i8> 
%res, <32 x i8>* %a + ret void +} + +define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: or_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: or_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @or_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: or_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = or <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: or_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: or_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <4 x i32> %op1, 
%op2 + ret <4 x i32> %res +} + +define void @or_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: or_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = or <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: or_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = or <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: or_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = or <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @or_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: or_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = or <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; XOR +; + +define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: xor_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 
killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: xor_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @xor_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: xor_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: eor z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = xor <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: xor_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: xor_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @xor_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: xor_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: eor z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; 
CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = xor <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: xor_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: xor_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @xor_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: xor_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: eor z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = xor <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: xor_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = xor <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: xor_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 
killed $q1 def $z1 +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = xor <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @xor_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: xor_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: eor z1.d, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = xor <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -0,0 +1,893 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +; This test only tests the legal types for a given vector width, as mulh nodes +; do not get generated for non-legal types. 
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; SMULH
+;
+
+define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 {
+; CHECK-LABEL: smulh_v4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI0_0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    adrp x8, .LCPI0_1
+; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    lsl z1.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI0_1]
+; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    asr z1.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z3.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %insert = insertelement <4 x i16> undef, i16 4, i64 0
+  %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer
+  %1 = sext <4 x i8> %op1 to <4 x i16>
+  %2 = sext <4 x i8> %op2 to <4 x i16>
+  %mul = mul <4 x i16> %1, %2
+  %shr = lshr <4 x i16> %mul, <i16 4, i16 4, i16 4, i16 4>
+  %res = trunc <4 x i16> %shr to <4 x i8>
+  ret <4 x i8> %res
+}
+
+define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+; CHECK-LABEL: smulh_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %insert = insertelement <8 x i16> undef, i16 8, i64 0
+  %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+  %1 = sext <8 x i8> %op1 to <8 x i16>
+  %2 = sext <8 x i8> %op2 to <8 x i16>
+  %mul = mul <8 x i16> %1, %2
+  %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %res = trunc <8 x i16> %shr to <8 x i8>
+  ret <8 x i8> %res
+}
+
+define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
+; CHECK-LABEL: smulh_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0
killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = sext <16 x i8> %op1 to <16 x i16> + %2 = sext <16 x i8> %op2 to <16 x i16> + %mul = mul <16 x i16> %1, %2 + %shr = lshr <16 x i16> %mul, + %res = trunc <16 x i16> %shr to <16 x i8> + ret <16 x i8> %res +} + +define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: smulh_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: sunpklo z0.h, z2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: ldp q4, q5, [x1] +; CHECK-NEXT: sunpklo z6.h, z3.b +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: sunpklo z1.h, z4.b +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: sunpklo z4.h, z4.b +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: sunpklo z7.h, z5.b +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: sunpklo z5.h, z5.b +; CHECK-NEXT: mul z3.h, p0/m, z3.h, z5.h +; CHECK-NEXT: movprfx z5, z6 +; CHECK-NEXT: mul z5.h, p0/m, z5.h, z7.h +; CHECK-NEXT: mul z2.h, p0/m, z2.h, z4.h +; CHECK-NEXT: movprfx z4, z5 +; CHECK-NEXT: lsr z4.h, p0/m, z4.h, z16.h +; CHECK-NEXT: lsr z3.h, p0/m, z3.h, z16.h +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z5.h, z3.h[7] +; CHECK-NEXT: mov z6.h, z3.h[6] +; CHECK-NEXT: mov z7.h, z3.h[5] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w9, [sp, #16] +; CHECK-NEXT: strb w8, [sp, #24] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: mov z17.h, z3.h[4] +; CHECK-NEXT: mov z18.h, z3.h[3] +; CHECK-NEXT: mov z19.h, z3.h[2] +; CHECK-NEXT: strb w10, [sp, #31] +; CHECK-NEXT: fmov w10, s17 
+; CHECK-NEXT: strb w8, [sp, #30] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: strb w9, [sp, #29] +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: mov z20.h, z3.h[1] +; CHECK-NEXT: mov z3.h, z4.h[7] +; CHECK-NEXT: mov z21.h, z4.h[6] +; CHECK-NEXT: strb w10, [sp, #28] +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: strb w8, [sp, #27] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: strb w9, [sp, #26] +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: mov z22.h, z4.h[5] +; CHECK-NEXT: mov z23.h, z4.h[4] +; CHECK-NEXT: mov z24.h, z4.h[3] +; CHECK-NEXT: strb w10, [sp, #25] +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: strb w8, [sp, #23] +; CHECK-NEXT: fmov w8, s23 +; CHECK-NEXT: strb w9, [sp, #22] +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: mov z25.h, z4.h[2] +; CHECK-NEXT: mov z26.h, z4.h[1] +; CHECK-NEXT: strb w10, [sp, #21] +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: strb w8, [sp, #20] +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: lsr z1.h, p0/m, z1.h, z16.h +; CHECK-NEXT: strb w9, [sp, #19] +; CHECK-NEXT: fmov w8, s26 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z16.h +; CHECK-NEXT: mov z2.h, z1.h[7] +; CHECK-NEXT: mov z3.h, z1.h[6] +; CHECK-NEXT: strb w10, [sp, #18] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: strb w8, [sp, #17] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strb w9, [sp, #8] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.h, z1.h[5] +; CHECK-NEXT: mov z5.h, z1.h[4] +; CHECK-NEXT: mov z6.h, z1.h[3] +; CHECK-NEXT: strb w10, [sp] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w8, [sp, #15] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: strb w9, [sp, #14] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z7.h, z1.h[2] +; CHECK-NEXT: mov z16.h, z1.h[1] +; CHECK-NEXT: mov z1.h, z0.h[7] +; CHECK-NEXT: strb w10, [sp, #13] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strb w8, [sp, #12] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: strb w9, [sp, #11] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z17.h, z0.h[6] +; CHECK-NEXT: mov z18.h, z0.h[5] +; CHECK-NEXT: mov z19.h, z0.h[4] +; 
CHECK-NEXT: strb w10, [sp, #10]
+; CHECK-NEXT:    fmov w10, s17
+; CHECK-NEXT:    strb w8, [sp, #9]
+; CHECK-NEXT:    fmov w8, s18
+; CHECK-NEXT:    strb w9, [sp, #7]
+; CHECK-NEXT:    fmov w9, s19
+; CHECK-NEXT:    mov z20.h, z0.h[3]
+; CHECK-NEXT:    mov z21.h, z0.h[2]
+; CHECK-NEXT:    mov z22.h, z0.h[1]
+; CHECK-NEXT:    strb w10, [sp, #6]
+; CHECK-NEXT:    fmov w10, s20
+; CHECK-NEXT:    strb w8, [sp, #5]
+; CHECK-NEXT:    fmov w8, s21
+; CHECK-NEXT:    strb w9, [sp, #4]
+; CHECK-NEXT:    fmov w9, s22
+; CHECK-NEXT:    strb w10, [sp, #3]
+; CHECK-NEXT:    strb w8, [sp, #2]
+; CHECK-NEXT:    strb w9, [sp, #1]
+; CHECK-NEXT:    ldp q0, q1, [sp]
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+  %op1 = load <32 x i8>, <32 x i8>* %a
+  %op2 = load <32 x i8>, <32 x i8>* %b
+  %1 = sext <32 x i8> %op1 to <32 x i16>
+  %2 = sext <32 x i8> %op2 to <32 x i16>
+  %mul = mul <32 x i16> %1, %2
+  %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %res = trunc <32 x i16> %shr to <32 x i8>
+  store <32 x i8> %res, <32 x i8>* %a
+  ret void
+}
+
+define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 {
+; CHECK-LABEL: smulh_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI4_0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    lsl z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    asr z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %1 = sext <2 x i16> %op1 to <2 x i32>
+  %2 = sext <2 x i16> %op2 to <2 x i32>
+  %mul = mul <2 x i32> %1, %2
+  %shr = lshr <2 x i32> %mul, <i32 16, i32 16>
+  %res = trunc <2 x i32> %shr to <2 x i16>
+  ret <2 x i16> %res
+}
+
+define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+; CHECK-LABEL: smulh_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    //
kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %1 = sext <4 x i16> %op1 to <4 x i32>
+  %2 = sext <4 x i16> %op2 to <4 x i32>
+  %mul = mul <4 x i32> %1, %2
+  %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
+  %res = trunc <4 x i32> %shr to <4 x i16>
+  ret <4 x i16> %res
+}
+
+define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+; CHECK-LABEL: smulh_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %1 = sext <8 x i16> %op1 to <8 x i32>
+  %2 = sext <8 x i16> %op2 to <8 x i32>
+  %mul = mul <8 x i32> %1, %2
+  %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %res = trunc <8 x i32> %shr to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+; CHECK-LABEL: smulh_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    mov z6.d, z2.d
+; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    mov z2.d, z3.d
+; CHECK-NEXT:    smulh z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z3.b, #8
+; CHECK-NEXT:    movprfx z3, z5
+; CHECK-NEXT:    smulh z3.h, p0/m, z3.h, z6.h
+; CHECK-NEXT:    smulh z2.h, p0/m, z2.h, z4.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z3.h
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z2.h
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i16>, <16 x i16>* %a
+  %op2 = load <16 x i16>, <16 x i16>* %b
+  %1 = sext <16 x i16> %op1 to <16 x i32>
+  %2 = sext <16 x i16>
%op2 to <16 x i32>
+  %mul = mul <16 x i32> %1, %2
+  %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %res = trunc <16 x i32> %shr to <16 x i16>
+  store <16 x i16> %res, <16 x i16>* %a
+  ret void
+}
+
+define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+; CHECK-LABEL: smulh_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %1 = sext <2 x i32> %op1 to <2 x i64>
+  %2 = sext <2 x i32> %op2 to <2 x i64>
+  %mul = mul <2 x i64> %1, %2
+  %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
+  %res = trunc <2 x i64> %shr to <2 x i32>
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+; CHECK-LABEL: smulh_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %1 = sext <4 x i32> %op1 to <4 x i64>
+  %2 = sext <4 x i32> %op2 to <4 x i64>
+  %mul = mul <4 x i64> %1, %2
+  %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
+  %res = trunc <4 x i64> %shr to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
+; CHECK-LABEL: smulh_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    mov z6.d, z2.d
+; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    mov z2.d, z3.d
+; CHECK-NEXT:    smulh z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    ext z2.b, z2.b, z3.b, #8
+; CHECK-NEXT:    movprfx z3, z5
+; CHECK-NEXT:    smulh z3.s, p0/m, z3.s, z6.s
+; CHECK-NEXT:
smulh z2.s, p0/m, z2.s, z4.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z3.s
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z2.s
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %op2 = load <8 x i32>, <8 x i32>* %b
+  %1 = sext <8 x i32> %op1 to <8 x i64>
+  %2 = sext <8 x i32> %op2 to <8 x i64>
+  %mul = mul <8 x i64> %1, %2
+  %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %res = trunc <8 x i64> %shr to <8 x i32>
+  store <8 x i32> %res, <8 x i32>* %a
+  ret void
+}
+
+define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+; CHECK-LABEL: smulh_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %insert = insertelement <1 x i128> undef, i128 64, i128 0
+  %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
+  %1 = sext <1 x i64> %op1 to <1 x i128>
+  %2 = sext <1 x i64> %op2 to <1 x i128>
+  %mul = mul <1 x i128> %1, %2
+  %shr = lshr <1 x i128> %mul, %splat
+  %res = trunc <1 x i128> %shr to <1 x i64>
+  ret <1 x i64> %res
+}
+
+define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+; CHECK-LABEL: smulh_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %1 = sext <2 x i64> %op1 to <2 x i128>
+  %2 = sext <2 x i64> %op2 to <2 x i128>
+  %mul = mul <2 x i128> %1, %2
+  %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
+  %res = trunc <2 x i128> %shr to <2 x i64>
+  ret <2 x i64> %res
+}
+
+define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
+; CHECK-LABEL: smulh_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEXT:    mov z4.d, z1.d[1]
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    mov z1.d, z0.d[1]
+; CHECK-NEXT:    fmov x13, d4
+; CHECK-NEXT:    fmov x10, d1
+; CHECK-NEXT:    mov z0.d, z2.d[1]
+; CHECK-NEXT:    fmov x12, d2
+; CHECK-NEXT:    fmov x11, d0
+; CHECK-NEXT:    mov z0.d, z3.d[1]
+; CHECK-NEXT:    fmov x14, d0
+; CHECK-NEXT:    smulh x9, x9, x12
+; CHECK-NEXT:    smulh x10, x10, x11
+; CHECK-NEXT:    fmov x11, d3
+; CHECK-NEXT:    smulh x12, x13, x14
+; CHECK-NEXT:    smulh x8, x8, x11
+; CHECK-NEXT:    fmov d0, x9
+; CHECK-NEXT:    fmov d1, x10
+; CHECK-NEXT:    fmov d3, x12
+; CHECK-NEXT:    fmov d2, x8
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    splice z2.d, p0, z2.d, z3.d
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x i64>, <4 x i64>* %a
+  %op2 = load <4 x i64>, <4 x i64>* %b
+  %1 = sext <4 x i64> %op1 to <4 x i128>
+  %2 = sext <4 x i64> %op2 to <4 x i128>
+  %mul = mul <4 x i128> %1, %2
+  %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
+  %res = trunc <4 x i128> %shr to <4 x i64>
+  store <4 x i64> %res, <4 x i64>* %a
+  ret void
+}
+
+;
+; UMULH
+;
+
+define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 {
+; CHECK-LABEL: umulh_v4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI14_0
+; CHECK-NEXT:    adrp x9, .LCPI14_1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT:    ldr d3, [x9, :lo12:.LCPI14_1]
+; CHECK-NEXT:    and z0.d, z0.d, z2.d
+; CHECK-NEXT:    and z1.d, z1.d, z2.d
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z3.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %1 = zext <4 x i8> %op1 to <4 x i16>
+  %2 = zext <4 x i8> %op2 to <4 x i16>
+  %mul = mul <4 x i16> %1, %2
+  %shr = lshr <4 x i16> %mul, <i16 4, i16 4, i16 4, i16 4>
+  %res = trunc <4 x i16> %shr to <4 x i8>
+  ret <4 x i8> %res
+}
+
+define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
+; CHECK-LABEL:
umulh_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <8 x i8> %op1 to <8 x i16> + %2 = zext <8 x i8> %op2 to <8 x i16> + %mul = mul <8 x i16> %1, %2 + %shr = lshr <8 x i16> %mul, + %res = trunc <8 x i16> %shr to <8 x i8> + ret <8 x i8> %res +} + +define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: umulh_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <16 x i8> %op1 to <16 x i16> + %2 = zext <16 x i8> %op2 to <16 x i16> + %mul = mul <16 x i16> %1, %2 + %shr = lshr <16 x i16> %mul, + %res = trunc <16 x i16> %shr to <16 x i8> + ret <16 x i8> %res +} + +define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: umulh_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: uunpklo z0.h, z2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: ldp q4, q5, [x1] +; CHECK-NEXT: uunpklo z6.h, z3.b +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: uunpklo z1.h, z4.b +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: uunpklo z4.h, z4.b +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: uunpklo z7.h, z5.b +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI17_0] +; CHECK-NEXT: uunpklo z5.h, z5.b +; CHECK-NEXT: mul z3.h, p0/m, z3.h, z5.h +; CHECK-NEXT: movprfx z5, z6 +; CHECK-NEXT: mul z5.h, p0/m, 
z5.h, z7.h +; CHECK-NEXT: mul z2.h, p0/m, z2.h, z4.h +; CHECK-NEXT: movprfx z4, z5 +; CHECK-NEXT: lsr z4.h, p0/m, z4.h, z16.h +; CHECK-NEXT: lsr z3.h, p0/m, z3.h, z16.h +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z5.h, z3.h[7] +; CHECK-NEXT: mov z6.h, z3.h[6] +; CHECK-NEXT: mov z7.h, z3.h[5] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w9, [sp, #16] +; CHECK-NEXT: strb w8, [sp, #24] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: mov z17.h, z3.h[4] +; CHECK-NEXT: mov z18.h, z3.h[3] +; CHECK-NEXT: mov z19.h, z3.h[2] +; CHECK-NEXT: strb w10, [sp, #31] +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: strb w8, [sp, #30] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: strb w9, [sp, #29] +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: mov z20.h, z3.h[1] +; CHECK-NEXT: mov z3.h, z4.h[7] +; CHECK-NEXT: mov z21.h, z4.h[6] +; CHECK-NEXT: strb w10, [sp, #28] +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: strb w8, [sp, #27] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: strb w9, [sp, #26] +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: mov z22.h, z4.h[5] +; CHECK-NEXT: mov z23.h, z4.h[4] +; CHECK-NEXT: mov z24.h, z4.h[3] +; CHECK-NEXT: strb w10, [sp, #25] +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: strb w8, [sp, #23] +; CHECK-NEXT: fmov w8, s23 +; CHECK-NEXT: strb w9, [sp, #22] +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: mov z25.h, z4.h[2] +; CHECK-NEXT: mov z26.h, z4.h[1] +; CHECK-NEXT: strb w10, [sp, #21] +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: strb w8, [sp, #20] +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: lsr z1.h, p0/m, z1.h, z16.h +; CHECK-NEXT: strb w9, [sp, #19] +; CHECK-NEXT: fmov w8, s26 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z16.h +; CHECK-NEXT: mov z2.h, z1.h[7] +; CHECK-NEXT: mov z3.h, z1.h[6] +; CHECK-NEXT: strb w10, [sp, #18] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: strb w8, [sp, #17] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strb w9, [sp, #8] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.h, z1.h[5] +; 
CHECK-NEXT: mov z5.h, z1.h[4] +; CHECK-NEXT: mov z6.h, z1.h[3] +; CHECK-NEXT: strb w10, [sp] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w8, [sp, #15] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: strb w9, [sp, #14] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z7.h, z1.h[2] +; CHECK-NEXT: mov z16.h, z1.h[1] +; CHECK-NEXT: mov z1.h, z0.h[7] +; CHECK-NEXT: strb w10, [sp, #13] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strb w8, [sp, #12] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: strb w9, [sp, #11] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z17.h, z0.h[6] +; CHECK-NEXT: mov z18.h, z0.h[5] +; CHECK-NEXT: mov z19.h, z0.h[4] +; CHECK-NEXT: strb w10, [sp, #10] +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: strb w8, [sp, #9] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: strb w9, [sp, #7] +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: mov z20.h, z0.h[3] +; CHECK-NEXT: mov z21.h, z0.h[2] +; CHECK-NEXT: mov z22.h, z0.h[1] +; CHECK-NEXT: strb w10, [sp, #6] +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: strb w8, [sp, #5] +; CHECK-NEXT: fmov w8, s21 +; CHECK-NEXT: strb w9, [sp, #4] +; CHECK-NEXT: fmov w9, s22 +; CHECK-NEXT: strb w10, [sp, #3] +; CHECK-NEXT: strb w8, [sp, #2] +; CHECK-NEXT: strb w9, [sp, #1] +; CHECK-NEXT: ldp q0, q1, [sp] +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %1 = zext <32 x i8> %op1 to <32 x i16> + %2 = zext <32 x i8> %op2 to <32 x i16> + %mul = mul <32 x i16> %1, %2 + %shr = lshr <32 x i16> %mul, + %res = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 { +; CHECK-LABEL: umulh_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: adrp x9, .LCPI18_1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ldr d2, [x8, 
:lo12:.LCPI18_0]
+; CHECK-NEXT:    ldr d3, [x9, :lo12:.LCPI18_1]
+; CHECK-NEXT:    and z0.d, z0.d, z2.d
+; CHECK-NEXT:    and z1.d, z1.d, z2.d
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z3.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %1 = zext <2 x i16> %op1 to <2 x i32>
+  %2 = zext <2 x i16> %op2 to <2 x i32>
+  %mul = mul <2 x i32> %1, %2
+  %shr = lshr <2 x i32> %mul, <i32 16, i32 16>
+  %res = trunc <2 x i32> %shr to <2 x i16>
+  ret <2 x i16> %res
+}
+
+define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+; CHECK-LABEL: umulh_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %1 = zext <4 x i16> %op1 to <4 x i32>
+  %2 = zext <4 x i16> %op2 to <4 x i32>
+  %mul = mul <4 x i32> %1, %2
+  %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
+  %res = trunc <4 x i32> %shr to <4 x i16>
+  ret <4 x i16> %res
+}
+
+define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+; CHECK-LABEL: umulh_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %1 = zext <8 x i16> %op1 to <8 x i32>
+  %2 = zext <8 x i16> %op2 to <8 x i32>
+  %mul = mul <8 x i32> %1, %2
+  %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %res = trunc <8 x i32> %shr to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
+; CHECK-LABEL: umulh_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b,
#8
+; CHECK-NEXT:    mov z6.d, z2.d
+; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    mov z2.d, z3.d
+; CHECK-NEXT:    umulh z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z3.b, #8
+; CHECK-NEXT:    movprfx z3, z5
+; CHECK-NEXT:    umulh z3.h, p0/m, z3.h, z6.h
+; CHECK-NEXT:    umulh z2.h, p0/m, z2.h, z4.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z3.h
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z2.h
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i16>, <16 x i16>* %a
+  %op2 = load <16 x i16>, <16 x i16>* %b
+  %1 = zext <16 x i16> %op1 to <16 x i32>
+  %2 = zext <16 x i16> %op2 to <16 x i32>
+  %mul = mul <16 x i32> %1, %2
+  %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %res = trunc <16 x i32> %shr to <16 x i16>
+  store <16 x i16> %res, <16 x i16>* %a
+  ret void
+}
+
+define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+; CHECK-LABEL: umulh_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %1 = zext <2 x i32> %op1 to <2 x i64>
+  %2 = zext <2 x i32> %op2 to <2 x i64>
+  %mul = mul <2 x i64> %1, %2
+  %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
+  %res = trunc <2 x i64> %shr to <2 x i32>
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+; CHECK-LABEL: umulh_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %1 = zext <4 x i32> %op1 to <4 x i64>
+  %2 = zext <4 x i32> %op2 to <4 x i64>
+  %mul = mul <4 x i64> %1, %2
+  %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
+  %res = trunc <4 x i64> %shr to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define void @umulh_v8i32(<8 x
i32>* %a, <8 x i32>* %b) #0 {
+; CHECK-LABEL: umulh_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    mov z6.d, z2.d
+; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    mov z2.d, z3.d
+; CHECK-NEXT:    umulh z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    ext z2.b, z2.b, z3.b, #8
+; CHECK-NEXT:    movprfx z3, z5
+; CHECK-NEXT:    umulh z3.s, p0/m, z3.s, z6.s
+; CHECK-NEXT:    umulh z2.s, p0/m, z2.s, z4.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z3.s
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z2.s
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %op2 = load <8 x i32>, <8 x i32>* %b
+  %insert = insertelement <8 x i64> undef, i64 32, i64 0
+  %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+  %1 = zext <8 x i32> %op1 to <8 x i64>
+  %2 = zext <8 x i32> %op2 to <8 x i64>
+  %mul = mul <8 x i64> %1, %2
+  %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %res = trunc <8 x i64> %shr to <8 x i32>
+  store <8 x i32> %res, <8 x i32>* %a
+  ret void
+}
+
+define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
+; CHECK-LABEL: umulh_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %1 = zext <1 x i64> %op1 to <1 x i128>
+  %2 = zext <1 x i64> %op2 to <1 x i128>
+  %mul = mul <1 x i128> %1, %2
+  %shr = lshr <1 x i128> %mul, <i128 64>
+  %res = trunc <1 x i128> %shr to <1 x i64>
+  ret <1 x i64> %res
+}
+
+define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+; CHECK-LABEL: umulh_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0
def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %1 = zext <2 x i64> %op1 to <2 x i128> + %2 = zext <2 x i64> %op2 to <2 x i128> + %mul = mul <2 x i128> %1, %2 + %shr = lshr <2 x i128> %mul, + %res = trunc <2 x i128> %shr to <2 x i64> + ret <2 x i64> %res +} + +define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: umulh_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: mov z4.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x13, d4 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: mov z0.d, z2.d[1] +; CHECK-NEXT: fmov x12, d2 +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: mov z0.d, z3.d[1] +; CHECK-NEXT: fmov x14, d0 +; CHECK-NEXT: umulh x9, x9, x12 +; CHECK-NEXT: umulh x10, x10, x11 +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: umulh x12, x13, x14 +; CHECK-NEXT: umulh x8, x8, x11 +; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: fmov d1, x10 +; CHECK-NEXT: fmov d3, x12 +; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: splice z2.d, p0, z2.d, z3.d +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %1 = zext <4 x i64> %op1 to <4 x i128> + %2 = zext <4 x i64> %op2 to <4 x i128> + %mul = mul <4 x i128> %1, %2 + %shr = lshr <4 x i128> %mul, + %res = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -0,0 
+1,742 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; SREM +; + +define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: srem_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ptrue p1.s, vl4 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: asr z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: sunpklo z2.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: ldr d2, [sp, #8] +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = srem <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: srem_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: sunpklo z2.h, z1.b +; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z4.s, z2.h +; CHECK-NEXT: sunpkhi z5.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: 
sunpklo z3.s, z3.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h +; CHECK-NEXT: mov z3.h, z2.h[7] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.h, z2.h[6] +; CHECK-NEXT: mov z5.h, z2.h[5] +; CHECK-NEXT: mov z6.h, z2.h[4] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z7.h, z2.h[3] +; CHECK-NEXT: mov z16.h, z2.h[2] +; CHECK-NEXT: mov z2.h, z2.h[1] +; CHECK-NEXT: strb w10, [sp, #14] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strb w8, [sp, #13] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: ldr d2, [sp, #8] +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = srem <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: srem_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: sunpkhi z2.h, z1.b +; CHECK-NEXT: sunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z5.s, z2.h +; CHECK-NEXT: sunpkhi z6.s, z3.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z4.h, z1.b +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: sunpkhi z6.s, z4.h +; CHECK-NEXT: sunpkhi z7.s, z3.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z2.h, 
z5.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: srem_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q3, q1, [x1] +; CHECK-NEXT: sunpkhi z5.h, z0.b +; CHECK-NEXT: sunpklo z7.h, z0.b +; CHECK-NEXT: sunpkhi z17.s, z5.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sunpkhi z4.h, z1.b +; CHECK-NEXT: sunpklo z6.h, z1.b +; CHECK-NEXT: sunpkhi z16.s, z4.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpkhi z18.s, z6.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpkhi z5.s, z7.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z18.s +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: uzp1 z5.h, z6.h, z5.h +; CHECK-NEXT: sunpkhi z6.h, z3.b +; CHECK-NEXT: sunpkhi z7.h, z2.b +; CHECK-NEXT: uzp1 z4.h, z4.h, z16.h +; CHECK-NEXT: sunpkhi z16.s, z6.h +; CHECK-NEXT: sunpkhi z17.s, z7.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sunpklo z7.h, z3.b +; CHECK-NEXT: sunpklo z17.h, z2.b +; CHECK-NEXT: sunpkhi z18.s, z7.h +; CHECK-NEXT: sunpkhi z19.s, z17.h +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sunpklo z17.s, z17.h +; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z19.s +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z17.s +; CHECK-NEXT: uzp1 z6.h, z6.h, z16.h +; CHECK-NEXT: uzp1 z7.h, z7.h, z18.h +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: uzp1 z6.b, z7.b, z6.b +; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b +; CHECK-NEXT: mls z2.b, p0/m, z6.b, z3.b +; CHECK-NEXT: mls 
z0.b, p0/m, z4.b, z1.b +; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = srem <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: srem_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpklo z2.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: ldr d2, [sp, #8] +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = srem <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: srem_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z4.s, z1.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sunpklo z5.s, z0.h +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <8 x i16> %op1, 
%op2 + ret <8 x i16> %res +} + +define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: srem_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sunpkhi z17.s, z2.h +; CHECK-NEXT: ldp q3, q1, [x1] +; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpklo z7.s, z0.h +; CHECK-NEXT: sunpkhi z16.s, z3.h +; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: sunpkhi z4.s, z1.h +; CHECK-NEXT: sunpklo z6.s, z1.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpklo z5.s, z3.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sunpklo z7.s, z2.h +; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z7.s +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: uzp1 z5.h, z5.h, z16.h +; CHECK-NEXT: uzp1 z4.h, z6.h, z4.h +; CHECK-NEXT: mls z2.h, p0/m, z5.h, z3.h +; CHECK-NEXT: mls z0.h, p0/m, z4.h, z1.h +; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = srem <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: srem_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = srem <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: srem_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $q0 
killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @srem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: srem_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z2.s +; CHECK-NEXT: movprfx z5, z1 +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z3.s +; CHECK-NEXT: mls z0.s, p0/m, z4.s, z2.s +; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = srem <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: srem_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = srem <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: srem_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = srem <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @srem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: srem_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: sdiv 
z4.d, p0/m, z4.d, z2.d +; CHECK-NEXT: movprfx z5, z1 +; CHECK-NEXT: sdiv z5.d, p0/m, z5.d, z3.d +; CHECK-NEXT: mls z0.d, p0/m, z4.d, z2.d +; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = srem <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +; +; UREM +; + +define <4 x i8> @urem_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: urem_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: ldr d2, [sp, #8] +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = urem <4 x i8> %op1, %op2 + ret <4 x i8> %res +} + +define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: urem_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: uunpklo z2.h, z1.b +; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: 
uunpkhi z4.s, z2.h +; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h +; CHECK-NEXT: mov z3.h, z2.h[7] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.h, z2.h[6] +; CHECK-NEXT: mov z5.h, z2.h[5] +; CHECK-NEXT: mov z6.h, z2.h[4] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z7.h, z2.h[3] +; CHECK-NEXT: mov z16.h, z2.h[2] +; CHECK-NEXT: mov z2.h, z2.h[1] +; CHECK-NEXT: strb w10, [sp, #14] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strb w8, [sp, #13] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: ldr d2, [sp, #8] +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = urem <8 x i8> %op1, %op2 + ret <8 x i8> %res +} + +define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: urem_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: uunpkhi z2.h, z1.b +; CHECK-NEXT: uunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z5.s, z2.h +; CHECK-NEXT: uunpkhi z6.s, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z4.h, z1.b +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: uunpkhi z6.s, z4.h +; CHECK-NEXT: uunpkhi z7.s, z3.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: 
udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z5.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <16 x i8> %op1, %op2 + ret <16 x i8> %res +} + +define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: urem_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q3, q1, [x1] +; CHECK-NEXT: uunpkhi z5.h, z0.b +; CHECK-NEXT: uunpklo z7.h, z0.b +; CHECK-NEXT: uunpkhi z17.s, z5.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: uunpkhi z4.h, z1.b +; CHECK-NEXT: uunpklo z6.h, z1.b +; CHECK-NEXT: uunpkhi z16.s, z4.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpkhi z18.s, z6.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpkhi z5.s, z7.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z18.s +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: uzp1 z5.h, z6.h, z5.h +; CHECK-NEXT: uunpkhi z6.h, z3.b +; CHECK-NEXT: uunpkhi z7.h, z2.b +; CHECK-NEXT: uzp1 z4.h, z4.h, z16.h +; CHECK-NEXT: uunpkhi z16.s, z6.h +; CHECK-NEXT: uunpkhi z17.s, z7.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: uunpklo z7.h, z3.b +; CHECK-NEXT: uunpklo z17.h, z2.b +; CHECK-NEXT: uunpkhi z18.s, z7.h +; CHECK-NEXT: uunpkhi z19.s, z17.h +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: uunpklo z17.s, z17.h +; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z19.s +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z17.s +; CHECK-NEXT: uzp1 z6.h, z6.h, z16.h +; CHECK-NEXT: uzp1 z7.h, z7.h, z18.h +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: uzp1 z6.b, z7.b, 
z6.b +; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b +; CHECK-NEXT: mls z2.b, p0/m, z6.b, z3.b +; CHECK-NEXT: mls z0.b, p0/m, z4.b, z1.b +; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = urem <32 x i8> %op1, %op2 + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: urem_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: ldr d2, [sp, #8] +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %res = urem <4 x i16> %op1, %op2 + ret <4 x i16> %res +} + +define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: urem_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z4.s, z1.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uunpklo z5.s, z0.h +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h 
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <8 x i16> %op1, %op2 + ret <8 x i16> %res +} + +define void @urem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: urem_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpkhi z17.s, z2.h +; CHECK-NEXT: ldp q3, q1, [x1] +; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpklo z7.s, z0.h +; CHECK-NEXT: uunpkhi z16.s, z3.h +; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: uunpkhi z4.s, z1.h +; CHECK-NEXT: uunpklo z6.s, z1.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpklo z5.s, z3.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: uunpklo z7.s, z2.h +; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z7.s +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: uzp1 z5.h, z5.h, z16.h +; CHECK-NEXT: uzp1 z4.h, z6.h, z4.h +; CHECK-NEXT: mls z2.h, p0/m, z5.h, z3.h +; CHECK-NEXT: mls z0.h, p0/m, z4.h, z1.h +; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = urem <16 x i16> %op1, %op2 + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: urem_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = urem <2 x i32> %op1, %op2 + ret <2 x i32> %res +} + +define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: urem_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: 
udiv z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <4 x i32> %op1, %op2 + ret <4 x i32> %res +} + +define void @urem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: urem_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z2.s +; CHECK-NEXT: movprfx z5, z1 +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z3.s +; CHECK-NEXT: mls z0.s, p0/m, z4.s, z2.s +; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = urem <8 x i32> %op1, %op2 + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: urem_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %res = urem <1 x i64> %op1, %op2 + ret <1 x i64> %res +} + +define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: urem_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %res = urem <2 x i64> %op1, %op2 + ret <2 x i64> %res +} + +define void @urem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: urem_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; 
CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: udiv z4.d, p0/m, z4.d, z2.d +; CHECK-NEXT: movprfx z5, z1 +; CHECK-NEXT: udiv z5.d, p0/m, z5.d, z3.d +; CHECK-NEXT: mls z0.d, p0/m, z4.d, z2.d +; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = urem <4 x i64> %op1, %op2 + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +attributes #0 = { "target-features"="+sve" }