diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -22614,50 +22614,39 @@ return LowerToPredicatedOp(Op, DAG, PredOpcode); // Scalable vector i8/i16 DIV is not supported. Promote it to i32. - EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); - EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext()); - EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT); - - // If this is not a full vector, extend, div, and truncate it. - EVT WidenedVT = VT.widenIntegerVectorElementType(*DAG.getContext()); - if (DAG.getTargetLoweringInfo().isTypeLegal(WidenedVT)) { - unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(0)); - SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(1)); - SDValue Div = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0, Op1); + EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext()); + unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + + // If the wider type is legal: extend, op, and truncate. + EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext()); + if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) { + SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0)); + SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1)); + SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1); return DAG.getNode(ISD::TRUNCATE, dl, VT, Div); } - // Convert the operands to scalable vectors. - SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); - SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1)); + auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT, + &ExtendOpcode](SDValue Op) { + SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64); + SDValue IdxHalf = + DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64); + SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero); + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf); + return std::pair( + {DAG.getNode(ExtendOpcode, dl, PromVT, Lo), + DAG.getNode(ExtendOpcode, dl, PromVT, Hi)}); + }; - // Extend the scalable operands. - unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; - unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI; - SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0); - SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1); - SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0); - SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1); - - // Convert back to fixed vectors so the DIV can be further lowered. - Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo); - Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo); - Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi); - Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi); - SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT, - Op0Lo, Op1Lo); - SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT, - Op0Hi, Op1Hi); - - // Convert again to scalable vectors to truncate. - ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo); - ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi); - SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT, - ResultLo, ResultHi); - - return convertFromScalableVector(DAG, VT, ScalableResult); + // If wider type is not legal: split, extend, op, trunc and concat. + auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0)); + auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1)); + SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt); + SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt); + SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo); + SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc}); } SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE( diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll @@ -18,13 +18,13 @@ ; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 -; VBITS_GE_128-NEXT: sunpkhi z2.s, z1.h -; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_128-NEXT: sunpkhi z3.s, z0.h -; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h ; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h ; VBITS_GE_128-NEXT: ret ; @@ -94,29 +94,26 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 -; VBITS_GE_128-NEXT: sunpkhi z2.h, z1.b -; VBITS_GE_128-NEXT: sunpkhi z3.h, z0.b +; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: sunpklo z1.h, z1.b -; VBITS_GE_128-NEXT: sunpkhi z4.s, z2.h -; VBITS_GE_128-NEXT: sunpkhi z5.s, z3.h -; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h -; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h -; VBITS_GE_128-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0 ; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0 +; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; VBITS_GE_128-NEXT: sunpkhi z3.s, z1.h -; VBITS_GE_128-NEXT: sunpkhi z5.s, z0.h -; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: sshll2 v3.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: sshll2 v5.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: sdivr z3.s, p0/m, z3.s, z5.s +; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h ; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; VBITS_GE_128-NEXT: uzp1 z1.h, z2.h, z4.h -; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z3.h -; VBITS_GE_128-NEXT: uzp1 z0.b, z0.b, z1.b -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v3.8h +; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v2.16b ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: sdiv_v16i8: @@ -126,14 +123,19 @@ ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: sunpkhi z2.s, z1.h -; VBITS_GE_256-NEXT: sunpkhi z3.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h -; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z0.b, z2.b, z2.b ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 ; VBITS_GE_256-NEXT: ret ; @@ -206,15 +208,20 @@ ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpkhi z2.s, z1.h -; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z2.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: splice z2.h, p0, z2.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: st1b { z0.h }, p0, [x0] +; CHECK-NEXT: st1b { z2.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -230,26 +237,41 @@ ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: sunpkhi z2.h, z1.b -; CHECK-NEXT: sunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p2.h, vl64 +; CHECK-NEXT: sunpklo z2.h, z1.b +; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 +; CHECK-NEXT: sunpklo z4.s, z2.h +; CHECK-NEXT: sunpklo z5.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpkhi z4.s, z2.h -; CHECK-NEXT: sunpkhi z5.s, z3.h +; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s -; CHECK-NEXT: sunpkhi z5.s, z1.h +; CHECK-NEXT: sunpklo z5.s, z1.h ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h +; CHECK-NEXT: sunpklo z4.s, z0.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdiv z3.s, p1/m, z3.s, z5.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: splice z3.h, p2, z3.h, z2.h +; CHECK-NEXT: movprfx z2, z4 +; CHECK-NEXT: sdiv z2.s, p1/m, z2.s, z5.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b +; CHECK-NEXT: splice z1.h, p2, z1.h, z0.h +; CHECK-NEXT: ptrue p1.b, vl128 +; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: splice z2.b, p1, z2.b, z0.b +; CHECK-NEXT: st1b { z2.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <256 x i8>, ptr %a %op2 = load <256 x i8>, ptr %b @@ -308,17 +330,14 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: sunpkhi z2.s, z1.h -; VBITS_GE_128-NEXT: sunpkhi z3.s, z0.h -; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: sdiv_v8i16: @@ -351,24 +370,25 @@ define void @sdiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_128-NEXT: ldp q3, q0, [x1] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: sunpkhi z6.s, z0.h -; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_128-NEXT: ldp q3, q2, [x0] -; VBITS_GE_128-NEXT: sunpkhi z4.s, z1.h -; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_128-NEXT: sunpkhi z5.s, z2.h -; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: sshll2 v6.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0 +; VBITS_GE_128-NEXT: ldp q1, q2, [x0] +; VBITS_GE_128-NEXT: sshll2 v4.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: sshll2 v7.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: sshll2 v5.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z3.s ; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s -; VBITS_GE_128-NEXT: sunpkhi z5.s, z3.h -; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h -; VBITS_GE_128-NEXT: sdiv z5.s, p0/m, z5.s, z6.s -; VBITS_GE_128-NEXT: sdivr z0.s, p0/m, z0.s, z3.s -; VBITS_GE_128-NEXT: sdivr z1.s, p0/m, z1.s, z2.s -; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z5.h -; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z4.h -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: sdivr z0.s, p0/m, z0.s, z2.s +; VBITS_GE_128-NEXT: movprfx z2, z7 +; VBITS_GE_128-NEXT: sdiv z2.s, p0/m, z2.s, z6.s +; VBITS_GE_128-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: sdiv_v16i16: @@ -377,14 +397,19 @@ ; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: sunpkhi z2.s, z1.h -; VBITS_GE_256-NEXT: sunpkhi z3.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s ; VBITS_GE_256-NEXT: sdiv z0.s, p1/m, z0.s, z1.s -; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z0.h +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sdiv_v16i16: @@ -450,14 +475,19 @@ ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: sunpkhi z2.s, z1.h -; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z2.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ptrue p1.h, vl64 +; CHECK-NEXT: splice z2.h, p1, z2.h, z0.h +; CHECK-NEXT: st1h { z2.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i16>, ptr %a %op2 = load <128 x i16>, ptr %b @@ -724,13 +754,13 @@ ; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 -; VBITS_GE_128-NEXT: uunpkhi z2.s, z1.h -; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_128-NEXT: uunpkhi z3.s, z0.h -; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h ; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h ; VBITS_GE_128-NEXT: ret ; @@ -800,29 +830,26 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 -; VBITS_GE_128-NEXT: uunpkhi z2.h, z1.b -; VBITS_GE_128-NEXT: uunpkhi z3.h, z0.b +; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: uunpklo z1.h, z1.b -; VBITS_GE_128-NEXT: uunpkhi z4.s, z2.h -; VBITS_GE_128-NEXT: uunpkhi z5.s, z3.h -; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h -; VBITS_GE_128-NEXT: uunpklo z0.h, z0.b +; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0 ; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0 +; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; VBITS_GE_128-NEXT: uunpkhi z3.s, z1.h -; VBITS_GE_128-NEXT: uunpkhi z5.s, z0.h -; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: ushll2 v3.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: ushll2 v5.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: udivr z3.s, p0/m, z3.s, z5.s +; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h ; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; VBITS_GE_128-NEXT: uzp1 z1.h, z2.h, z4.h -; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z3.h -; VBITS_GE_128-NEXT: uzp1 z0.b, z0.b, z1.b -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v3.8h +; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v2.16b ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: udiv_v16i8: @@ -832,14 +859,19 @@ ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: uunpkhi z2.s, z1.h -; VBITS_GE_256-NEXT: uunpkhi z3.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h -; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z0.b, z2.b, z2.b ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 ; VBITS_GE_256-NEXT: ret ; @@ -900,14 +932,19 @@ ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1] ; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] -; CHECK-NEXT: uunpkhi z2.s, z0.h -; CHECK-NEXT: uunpkhi z3.s, z1.h +; CHECK-NEXT: uunpklo z2.s, z0.h +; CHECK-NEXT: uunpklo z3.s, z1.h +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: udivr z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h -; CHECK-NEXT: st1b { z0.h }, p0, [x0] +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ptrue p1.h, vl64 +; CHECK-NEXT: splice z2.h, p1, z2.h, z0.h +; CHECK-NEXT: st1b { z2.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -923,26 +960,41 @@ ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: uunpkhi z2.h, z1.b -; CHECK-NEXT: uunpkhi z3.h, z0.b +; CHECK-NEXT: ptrue p2.h, vl64 +; CHECK-NEXT: uunpklo z2.h, z1.b +; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 +; CHECK-NEXT: uunpklo z4.s, z2.h +; CHECK-NEXT: uunpklo z5.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 ; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpkhi z4.s, z2.h -; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s -; CHECK-NEXT: uunpkhi z5.s, z1.h +; CHECK-NEXT: uunpklo z5.s, z1.h ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h +; CHECK-NEXT: uunpklo z4.s, z0.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: udiv z3.s, p1/m, z3.s, z5.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: splice z3.h, p2, z3.h, z2.h +; CHECK-NEXT: movprfx z2, z4 +; CHECK-NEXT: udiv z2.s, p1/m, z2.s, z5.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b +; CHECK-NEXT: splice z1.h, p2, z1.h, z0.h +; CHECK-NEXT: ptrue p1.b, vl128 +; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: splice z2.b, p1, z2.b, z0.b +; CHECK-NEXT: st1b { z2.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <256 x i8>, ptr %a %op2 = load <256 x i8>, ptr %b @@ -1001,17 +1053,14 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: uunpkhi z2.s, z1.h -; VBITS_GE_128-NEXT: uunpkhi z3.s, z0.h -; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: udiv_v8i16: @@ -1044,24 +1093,25 @@ define void @udiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_128-NEXT: ldp q3, q0, [x1] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: uunpkhi z6.s, z0.h -; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_128-NEXT: ldp q3, q2, [x0] -; VBITS_GE_128-NEXT: uunpkhi z4.s, z1.h -; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_128-NEXT: uunpkhi z5.s, z2.h -; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_128-NEXT: ushll2 v6.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0 +; VBITS_GE_128-NEXT: ldp q1, q2, [x0] +; VBITS_GE_128-NEXT: ushll2 v4.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ushll2 v7.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: ushll2 v5.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z3.s ; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s -; VBITS_GE_128-NEXT: uunpkhi z5.s, z3.h -; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h -; VBITS_GE_128-NEXT: udiv z5.s, p0/m, z5.s, z6.s -; VBITS_GE_128-NEXT: udivr z0.s, p0/m, z0.s, z3.s -; VBITS_GE_128-NEXT: udivr z1.s, p0/m, z1.s, z2.s -; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z5.h -; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z4.h -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: udivr z0.s, p0/m, z0.s, z2.s +; VBITS_GE_128-NEXT: movprfx z2, z7 +; VBITS_GE_128-NEXT: udiv z2.s, p0/m, z2.s, z6.s +; VBITS_GE_128-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: udiv_v16i16: @@ -1070,14 +1120,19 @@ ; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: uunpkhi z2.s, z1.h -; VBITS_GE_256-NEXT: uunpkhi z3.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s ; VBITS_GE_256-NEXT: udiv z0.s, p1/m, z0.s, z1.s -; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z0.h +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: udiv_v16i16: @@ -1134,14 +1189,19 @@ ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: uunpkhi z2.s, z1.h -; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ptrue p1.h, vl64 +; CHECK-NEXT: splice z2.h, p1, z2.h, z0.h +; CHECK-NEXT: st1h { z2.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i16>, ptr %a %op2 = load <128 x i16>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -18,13 +18,13 @@ ; VBITS_GE_128-NEXT: sshll v2.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v3.8h, v0.8b, #0 -; VBITS_GE_128-NEXT: sunpkhi z4.s, z2.h -; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h -; VBITS_GE_128-NEXT: sunpkhi z5.s, z3.h -; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0 ; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z4.h +; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h ; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h ; VBITS_GE_128-NEXT: mls v0.8b, v2.8b, v1.8b ; VBITS_GE_128-NEXT: ret @@ -97,30 +97,28 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 -; VBITS_GE_128-NEXT: sunpkhi z2.h, z1.b -; VBITS_GE_128-NEXT: sunpkhi z3.h, z0.b +; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: sunpkhi z5.s, z2.h -; VBITS_GE_128-NEXT: sunpkhi z6.s, z3.h -; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h -; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h -; VBITS_GE_128-NEXT: sunpklo z4.h, z1.b +; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0 +; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: sshll v6.8h, v1.8b, #0 +; VBITS_GE_128-NEXT: sshll v7.8h, v0.8b, #0 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; VBITS_GE_128-NEXT: sunpklo z3.h, z0.b -; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s -; VBITS_GE_128-NEXT: sunpkhi z6.s, z4.h -; VBITS_GE_128-NEXT: sunpkhi z7.s, z3.h -; VBITS_GE_128-NEXT: sunpklo z4.s, z4.h -; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h -; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s -; VBITS_GE_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s -; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z5.h -; VBITS_GE_128-NEXT: uzp1 z3.h, z3.h, z6.h -; VBITS_GE_128-NEXT: uzp1 z2.b, z3.b, z2.b +; VBITS_GE_128-NEXT: sshll2 v3.4s, v6.8h, #0 +; VBITS_GE_128-NEXT: sshll2 v5.4s, v7.8h, #0 +; VBITS_GE_128-NEXT: sshll v6.4s, v6.4h, #0 +; VBITS_GE_128-NEXT: sdivr z3.s, p0/m, z3.s, z5.s +; VBITS_GE_128-NEXT: sshll v7.4s, v7.4h, #0 +; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h +; VBITS_GE_128-NEXT: movprfx z4, z7 +; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z6.s +; VBITS_GE_128-NEXT: uzp1 v3.8h, v4.8h, v3.8h +; VBITS_GE_128-NEXT: uzp1 v2.16b, v3.16b, v2.16b ; VBITS_GE_128-NEXT: mls v0.16b, v2.16b, v1.16b -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v16i8: @@ -129,15 +127,20 @@ ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: sunpklo z2.h, z1.b ; VBITS_GE_256-NEXT: sunpklo z3.h, z0.b +; VBITS_GE_256-NEXT: sunpklo z4.s, z2.h +; VBITS_GE_256-NEXT: sunpklo z5.s, z3.h +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: sunpkhi z4.s, z2.h -; VBITS_GE_256-NEXT: sunpkhi z5.s, z3.h ; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h ; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h ; VBITS_GE_256-NEXT: sdivr z4.s, p0/m, z4.s, z5.s ; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z4.h -; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: splice z4.h, p0, z4.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z2.b, z4.b, z4.b ; VBITS_GE_256-NEXT: mls v0.16b, v2.16b, v1.16b ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 ; VBITS_GE_256-NEXT: ret @@ -218,14 +221,19 @@ ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: sunpklo z2.h, z1.b ; CHECK-NEXT: sunpklo z3.h, z0.b -; CHECK-NEXT: sunpkhi z4.s, z2.h -; CHECK-NEXT: sunpkhi z5.s, z3.h +; CHECK-NEXT: sunpklo z4.s, z2.h +; CHECK-NEXT: sunpklo z5.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: ptrue p1.h, vl64 +; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h +; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -243,26 +251,42 @@ ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: sunpkhi z2.h, z1.b -; CHECK-NEXT: sunpkhi z3.h, z0.b -; CHECK-NEXT: sunpklo z4.h, z1.b -; CHECK-NEXT: sunpklo z5.h, z0.b -; CHECK-NEXT: sunpkhi z6.s, z2.h -; CHECK-NEXT: sunpkhi z7.s, z3.h +; CHECK-NEXT: ptrue p2.h, vl64 +; CHECK-NEXT: sunpklo z2.h, z1.b +; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: sunpklo z5.s, z2.h +; CHECK-NEXT: sunpklo z6.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sdivr z5.s, p1/m, z5.s, z6.s +; CHECK-NEXT: mov z6.d, z0.d ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sdivr z6.s, p1/m, z6.s, z7.s -; CHECK-NEXT: sunpkhi z7.s, z4.h +; CHECK-NEXT: ext z4.b, z4.b, z1.b, #128 ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: sunpkhi z3.s, z5.h +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #128 +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: sunpklo z3.h, z4.b +; CHECK-NEXT: sunpklo z4.h, z6.b +; CHECK-NEXT: splice z5.h, p2, z5.h, z2.h +; CHECK-NEXT: sunpklo z2.s, z3.h +; CHECK-NEXT: sunpklo z6.s, z4.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #128 +; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: sunpklo z5.s, z5.h -; CHECK-NEXT: sdiv z3.s, p1/m, z3.s, z7.s -; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z6.s +; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z4.b, z5.b, z5.b +; CHECK-NEXT: splice z2.h, p2, z2.h, z3.h +; CHECK-NEXT: ptrue p1.b, vl128 +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: splice z4.b, p1, z4.b, z2.b +; CHECK-NEXT: mls z0.b, p0/m, z4.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <256 x i8>, ptr %a @@ -323,19 +347,16 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: sunpkhi z2.s, z1.h -; VBITS_GE_128-NEXT: sunpkhi z3.s, z0.h -; VBITS_GE_128-NEXT: sunpklo z4.s, z1.h +; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v4.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; VBITS_GE_128-NEXT: sunpklo z5.s, z0.h +; VBITS_GE_128-NEXT: sshll v5.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: movprfx z3, z5 ; VBITS_GE_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s -; VBITS_GE_128-NEXT: uzp1 z2.h, z3.h, z2.h +; VBITS_GE_128-NEXT: uzp1 v2.8h, v3.8h, v2.8h ; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v8i16: @@ -370,26 +391,26 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q2, q0, [x0] +; VBITS_GE_128-NEXT: ldp q0, q1, [x1] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: sunpkhi z17.s, z2.h -; VBITS_GE_128-NEXT: ldp q3, q1, [x1] -; VBITS_GE_128-NEXT: sunpkhi z5.s, z0.h -; VBITS_GE_128-NEXT: sunpklo z7.s, z0.h -; VBITS_GE_128-NEXT: sunpkhi z16.s, z3.h -; VBITS_GE_128-NEXT: sdivr z16.s, p0/m, z16.s, z17.s -; VBITS_GE_128-NEXT: sunpkhi z4.s, z1.h -; VBITS_GE_128-NEXT: sunpklo z6.s, z1.h -; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s -; VBITS_GE_128-NEXT: sunpklo z5.s, z3.h -; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s -; VBITS_GE_128-NEXT: sunpklo z7.s, z2.h -; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z7.s -; VBITS_GE_128-NEXT: uzp1 z4.h, z6.h, z4.h -; VBITS_GE_128-NEXT: uzp1 z5.h, z5.h, z16.h -; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v3.8h -; VBITS_GE_128-NEXT: mls v0.8h, v4.8h, v1.8h -; VBITS_GE_128-NEXT: stp q2, q0, [x0] +; VBITS_GE_128-NEXT: sshll2 v5.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v7.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ldp q2, q3, [x0] +; VBITS_GE_128-NEXT: sshll2 v4.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: sshll2 v6.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: sshll v16.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: sshll2 v17.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s +; VBITS_GE_128-NEXT: sshll v6.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; VBITS_GE_128-NEXT: sshll v16.4s, v3.4h, #0 +; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z17.s +; VBITS_GE_128-NEXT: uzp1 v5.8h, v7.8h, v5.8h +; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z16.s +; VBITS_GE_128-NEXT: uzp1 v4.8h, v6.8h, v4.8h +; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v0.8h +; VBITS_GE_128-NEXT: mls v3.8h, v4.8h, v1.8h +; VBITS_GE_128-NEXT: stp q2, q3, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v16i16: @@ -398,14 +419,20 @@ ; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: sunpkhi z2.s, z1.h -; VBITS_GE_256-NEXT: sunpkhi z3.s, z0.h -; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s -; VBITS_GE_256-NEXT: sunpklo z4.s, z1.h +; VBITS_GE_256-NEXT: mov z3.d, z1.d +; VBITS_GE_256-NEXT: mov z4.d, z0.d +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z4.b, z4.b, z0.b, #16 +; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z5.s, z0.h -; VBITS_GE_256-NEXT: movprfx z3, z5 -; VBITS_GE_256-NEXT: sdiv z3.s, p1/m, z3.s, z4.s -; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z2.h +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z5.s +; VBITS_GE_256-NEXT: sdivr z3.s, p1/m, z3.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z3.h ; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -479,14 +506,20 @@ ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: sunpkhi z2.s, z1.h -; CHECK-NEXT: sunpkhi z3.s, z0.h -; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: sunpklo z4.s, z1.h +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #128 +; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z5.s, z0.h -; CHECK-NEXT: movprfx z3, z5 -; CHECK-NEXT: sdiv z3.s, p1/m, z3.s, z4.s -; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z5.s +; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: ptrue p1.h, vl64 +; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -805,13 +838,13 @@ ; VBITS_GE_128-NEXT: ushll v2.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v3.8h, v0.8b, #0 -; VBITS_GE_128-NEXT: uunpkhi z4.s, z2.h -; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_128-NEXT: uunpkhi z5.s, z3.h -; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0 ; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z4.h +; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h ; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h ; VBITS_GE_128-NEXT: mls v0.8b, v2.8b, v1.8b ; VBITS_GE_128-NEXT: ret @@ -884,30 +917,28 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 -; VBITS_GE_128-NEXT: uunpkhi z2.h, z1.b -; VBITS_GE_128-NEXT: uunpkhi z3.h, z0.b +; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: uunpkhi z5.s, z2.h -; VBITS_GE_128-NEXT: uunpkhi z6.s, z3.h -; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h -; VBITS_GE_128-NEXT: uunpklo z4.h, z1.b +; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0 +; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: ushll v6.8h, v1.8b, #0 +; VBITS_GE_128-NEXT: ushll v7.8h, v0.8b, #0 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; VBITS_GE_128-NEXT: uunpklo z3.h, z0.b -; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s -; VBITS_GE_128-NEXT: uunpkhi z6.s, z4.h -; VBITS_GE_128-NEXT: uunpkhi z7.s, z3.h -; VBITS_GE_128-NEXT: uunpklo z4.s, z4.h -; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h -; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s -; VBITS_GE_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s -; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z5.h -; VBITS_GE_128-NEXT: uzp1 z3.h, z3.h, z6.h -; VBITS_GE_128-NEXT: uzp1 z2.b, z3.b, z2.b +; VBITS_GE_128-NEXT: ushll2 v3.4s, v6.8h, #0 +; VBITS_GE_128-NEXT: ushll2 v5.4s, v7.8h, #0 +; VBITS_GE_128-NEXT: ushll v6.4s, v6.4h, #0 +; VBITS_GE_128-NEXT: udivr z3.s, p0/m, z3.s, z5.s +; VBITS_GE_128-NEXT: ushll v7.4s, v7.4h, #0 +; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h +; VBITS_GE_128-NEXT: movprfx z4, z7 +; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z6.s +; VBITS_GE_128-NEXT: uzp1 v3.8h, v4.8h, v3.8h +; VBITS_GE_128-NEXT: uzp1 v2.16b, v3.16b, v2.16b ; VBITS_GE_128-NEXT: mls v0.16b, v2.16b, v1.16b -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v16i8: @@ -916,15 +947,20 @@ ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: uunpklo z2.h, z1.b ; VBITS_GE_256-NEXT: uunpklo z3.h, z0.b +; VBITS_GE_256-NEXT: uunpklo z4.s, z2.h +; VBITS_GE_256-NEXT: uunpklo z5.s, z3.h +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: uunpkhi z4.s, z2.h -; VBITS_GE_256-NEXT: uunpkhi z5.s, z3.h ; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h ; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h ; VBITS_GE_256-NEXT: udivr z4.s, p0/m, z4.s, z5.s ; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z4.h -; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: splice z4.h, p0, z4.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z2.b, z4.b, z4.b ; VBITS_GE_256-NEXT: mls v0.16b, v2.16b, v1.16b ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 ; VBITS_GE_256-NEXT: ret @@ -1005,14 +1041,19 @@ ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: uunpklo z2.h, z1.b ; CHECK-NEXT: uunpklo z3.h, z0.b -; CHECK-NEXT: uunpkhi z4.s, z2.h -; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: uunpklo z4.s, z2.h +; CHECK-NEXT: uunpklo z5.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: ptrue p1.h, vl64 +; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h +; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -1030,26 +1071,42 @@ ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: uunpkhi z2.h, z1.b -; CHECK-NEXT: uunpkhi z3.h, z0.b -; CHECK-NEXT: uunpklo z4.h, z1.b -; CHECK-NEXT: uunpklo z5.h, z0.b -; CHECK-NEXT: uunpkhi z6.s, z2.h -; CHECK-NEXT: uunpkhi z7.s, z3.h +; CHECK-NEXT: ptrue p2.h, vl64 +; CHECK-NEXT: uunpklo z2.h, z1.b +; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: uunpklo z5.s, z2.h +; CHECK-NEXT: uunpklo z6.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: udivr z5.s, p1/m, z5.s, z6.s +; CHECK-NEXT: mov z6.d, z0.d ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: udivr z6.s, p1/m, z6.s, z7.s -; CHECK-NEXT: uunpkhi z7.s, z4.h +; CHECK-NEXT: ext z4.b, z4.b, z1.b, #128 ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: uunpkhi z3.s, z5.h +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #128 +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uunpklo z3.h, z4.b +; CHECK-NEXT: uunpklo z4.h, z6.b +; CHECK-NEXT: splice z5.h, p2, z5.h, z2.h +; CHECK-NEXT: uunpklo z2.s, z3.h +; CHECK-NEXT: uunpklo z6.s, z4.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #128 +; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: uunpklo z5.s, z5.h -; CHECK-NEXT: udiv z3.s, p1/m, z3.s, z7.s -; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z6.s +; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z4.b, z5.b, z5.b +; CHECK-NEXT: splice z2.h, p2, z2.h, z3.h +; CHECK-NEXT: ptrue p1.b, vl128 +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: splice z4.b, p1, z4.b, z2.b +; CHECK-NEXT: mls z0.b, p0/m, z4.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <256 x i8>, ptr %a @@ -1110,19 +1167,16 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1 -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: uunpkhi z2.s, z1.h -; VBITS_GE_128-NEXT: uunpkhi z3.s, z0.h -; VBITS_GE_128-NEXT: uunpklo z4.s, z1.h +; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v4.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; VBITS_GE_128-NEXT: uunpklo z5.s, z0.h +; VBITS_GE_128-NEXT: ushll v5.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: movprfx z3, z5 ; VBITS_GE_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s -; VBITS_GE_128-NEXT: uzp1 z2.h, z3.h, z2.h +; VBITS_GE_128-NEXT: uzp1 v2.8h, v3.8h, v2.8h ; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h -; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v8i16: @@ -1157,26 +1211,26 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q2, q0, [x0] +; VBITS_GE_128-NEXT: ldp q0, q1, [x1] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: uunpkhi z17.s, z2.h -; VBITS_GE_128-NEXT: ldp q3, q1, [x1] -; VBITS_GE_128-NEXT: uunpkhi z5.s, z0.h -; VBITS_GE_128-NEXT: uunpklo z7.s, z0.h -; VBITS_GE_128-NEXT: uunpkhi z16.s, z3.h -; VBITS_GE_128-NEXT: udivr z16.s, p0/m, z16.s, z17.s -; VBITS_GE_128-NEXT: uunpkhi z4.s, z1.h -; VBITS_GE_128-NEXT: uunpklo z6.s, z1.h -; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s -; VBITS_GE_128-NEXT: uunpklo z5.s, z3.h -; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s -; VBITS_GE_128-NEXT: uunpklo z7.s, z2.h -; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z7.s -; VBITS_GE_128-NEXT: uzp1 z4.h, z6.h, z4.h -; VBITS_GE_128-NEXT: uzp1 z5.h, z5.h, z16.h -; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v3.8h -; VBITS_GE_128-NEXT: mls v0.8h, v4.8h, v1.8h -; VBITS_GE_128-NEXT: stp q2, q0, [x0] +; VBITS_GE_128-NEXT: ushll2 v5.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v7.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ldp q2, q3, [x0] +; VBITS_GE_128-NEXT: ushll2 v4.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: ushll2 v6.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: ushll v16.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: ushll2 v17.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s +; VBITS_GE_128-NEXT: ushll v6.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; VBITS_GE_128-NEXT: ushll v16.4s, v3.4h, #0 +; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z17.s +; VBITS_GE_128-NEXT: uzp1 v5.8h, v7.8h, v5.8h +; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z16.s +; VBITS_GE_128-NEXT: uzp1 v4.8h, v6.8h, v4.8h +; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v0.8h +; VBITS_GE_128-NEXT: mls v3.8h, v4.8h, v1.8h +; VBITS_GE_128-NEXT: stp q2, q3, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v16i16: @@ -1185,14 +1239,20 @@ ; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: uunpkhi z2.s, z1.h -; VBITS_GE_256-NEXT: uunpkhi z3.s, z0.h -; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s -; VBITS_GE_256-NEXT: uunpklo z4.s, z1.h +; VBITS_GE_256-NEXT: mov z3.d, z1.d +; VBITS_GE_256-NEXT: mov z4.d, z0.d +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z4.b, z4.b, z0.b, #16 +; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z5.s, z0.h -; VBITS_GE_256-NEXT: movprfx z3, z5 -; VBITS_GE_256-NEXT: udiv z3.s, p1/m, z3.s, z4.s -; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z2.h +; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: uunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z5.s +; VBITS_GE_256-NEXT: udivr z3.s, p1/m, z3.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z3.h ; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -1266,14 +1326,20 @@ ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: uunpkhi z2.s, z1.h -; CHECK-NEXT: uunpkhi z3.s, z0.h -; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: uunpklo z4.s, z1.h +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #128 +; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z5.s, z0.h -; CHECK-NEXT: movprfx z3, z5 -; CHECK-NEXT: udiv z3.s, p1/m, z3.s, z4.s -; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z5.s +; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: ptrue p1.h, vl64 +; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -36,14 +36,19 @@ ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpkhi z2.s, z1.h -; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z2.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: splice z2.h, p0, z2.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z2.b, z2.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = sdiv <8 x i8> %op1, %op2 @@ -55,26 +60,43 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpkhi z2.h, z1.b -; CHECK-NEXT: sunpkhi z3.h, z0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: sunpklo z4.s, z2.h +; CHECK-NEXT: sunpklo z5.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: sunpkhi z4.s, z2.h -; CHECK-NEXT: sunpkhi z5.s, z3.h +; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpklo z5.s, z1.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: sunpkhi z3.s, z1.h -; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: ptrue p1.h, vl4 +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h +; CHECK-NEXT: movprfx z2, z3 +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z5.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b +; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: splice z0.b, p0, z0.b, z2.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = sdiv <16 x i8> %op1, %op2 @@ -84,48 +106,76 @@ define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; CHECK-LABEL: sdiv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q3, q0, [x1] +; CHECK-NEXT: ldp q0, q2, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q1, [x0] -; CHECK-NEXT: sunpkhi z4.h, z0.b -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpkhi z6.s, z4.h -; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: sunpkhi z16.s, z0.h -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpkhi z5.h, z1.b -; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: sunpkhi z7.s, z5.h +; CHECK-NEXT: ptrue p1.h, vl4 +; CHECK-NEXT: ptrue p2.b, vl8 +; CHECK-NEXT: ldp q1, q3, [x1] +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: sunpklo z5.h, z5.b +; CHECK-NEXT: sunpklo z7.s, z5.h +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 ; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: sunpklo z4.h, z4.b +; CHECK-NEXT: sunpklo z6.s, z4.h +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: sunpklo z4.s, z4.h ; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: sunpkhi z5.s, z1.h -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z6.h -; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: sunpkhi z1.h, z3.b -; CHECK-NEXT: sunpkhi z6.h, z2.b -; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z16.s -; CHECK-NEXT: sunpkhi z7.s, z1.h -; CHECK-NEXT: sunpkhi z16.s, z6.h -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z6.s, z6.h -; CHECK-NEXT: sunpklo z3.h, z3.b -; CHECK-NEXT: sunpklo z2.h, z2.b -; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s -; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z6.s -; CHECK-NEXT: sunpkhi z6.s, z3.h -; CHECK-NEXT: sunpkhi z16.s, z2.h +; CHECK-NEXT: sunpklo z7.s, z3.h +; CHECK-NEXT: sunpklo z5.s, z2.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: splice z6.h, p1, z6.h, z4.h ; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z16.s +; CHECK-NEXT: uzp1 z4.b, z6.b, z6.b +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z7.s ; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: uzp1 z1.h, z1.h, z7.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h -; CHECK-NEXT: uzp1 z1.b, z2.b, z1.b -; CHECK-NEXT: uzp1 z0.b, z0.b, z4.b -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: sunpklo z3.h, z1.b +; CHECK-NEXT: sunpklo z6.h, z0.b +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: splice z5.h, p1, z5.h, z2.h +; CHECK-NEXT: sunpklo z2.s, z1.h +; CHECK-NEXT: sunpklo z7.s, z0.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z7.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: sunpklo z2.s, z3.h +; CHECK-NEXT: sunpklo z7.s, z6.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z7.s +; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z6.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h +; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h +; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z1.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b +; CHECK-NEXT: splice z1.b, p2, z1.b, z0.b +; CHECK-NEXT: splice z2.b, p2, z2.b, z4.b +; CHECK-NEXT: stp q1, q2, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b @@ -172,14 +222,21 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: sunpkhi z2.s, z1.h -; CHECK-NEXT: sunpkhi z3.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = sdiv <8 x i16> %op1, %op2 @@ -189,24 +246,34 @@ define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { ; CHECK-LABEL: sdiv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: sunpkhi z6.s, z0.h +; CHECK-NEXT: ptrue p1.h, vl4 +; CHECK-NEXT: sunpklo z6.s, z3.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: sunpklo z4.s, z0.h +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: sunpkhi z4.s, z1.h +; CHECK-NEXT: sunpklo z7.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpkhi z5.s, z2.h +; CHECK-NEXT: sunpklo z5.s, z2.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: sunpkhi z5.s, z3.h -; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z3.s -; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z2.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h -; CHECK-NEXT: uzp1 z1.h, z1.h, z4.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: sunpklo z2.s, z3.h +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: movprfx z2, z7 +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z6.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h +; CHECK-NEXT: splice z2.h, p1, z2.h, z1.h +; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h +; CHECK-NEXT: stp q2, q3, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a %op2 = load <16 x i16>, <16 x i16>* %b @@ -331,14 +398,19 @@ ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpkhi z2.s, z1.h -; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: splice z2.h, p0, z2.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z2.b, z2.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = udiv <8 x i8> %op1, %op2 @@ -350,26 +422,43 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpkhi z2.h, z1.b -; CHECK-NEXT: uunpkhi z3.h, z0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: uunpklo z4.s, z2.h +; CHECK-NEXT: uunpklo z5.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: uunpkhi z4.s, z2.h -; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpklo z5.s, z1.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: uunpkhi z3.s, z1.h -; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: ptrue p1.h, vl4 +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h +; CHECK-NEXT: movprfx z2, z3 +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z5.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b +; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: splice z0.b, p0, z0.b, z2.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = udiv <16 x i8> %op1, %op2 @@ -379,48 +468,76 @@ define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; CHECK-LABEL: udiv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q3, q0, [x1] +; CHECK-NEXT: ldp q0, q2, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q1, [x0] -; CHECK-NEXT: uunpkhi z4.h, z0.b -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpkhi z6.s, z4.h -; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: uunpkhi z16.s, z0.h -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpkhi z5.h, z1.b -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: uunpkhi z7.s, z5.h +; CHECK-NEXT: ptrue p1.h, vl4 +; CHECK-NEXT: ptrue p2.b, vl8 +; CHECK-NEXT: ldp q1, q3, [x1] +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: uunpklo z5.h, z5.b +; CHECK-NEXT: uunpklo z7.s, z5.h +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 ; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: uunpklo z4.h, z4.b +; CHECK-NEXT: uunpklo z6.s, z4.h +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: uunpklo z4.s, z4.h ; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: uunpkhi z5.s, z1.h -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z6.h -; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uunpkhi z1.h, z3.b -; CHECK-NEXT: uunpkhi z6.h, z2.b -; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z16.s -; CHECK-NEXT: uunpkhi z7.s, z1.h -; CHECK-NEXT: uunpkhi z16.s, z6.h -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z6.s, z6.h -; CHECK-NEXT: uunpklo z3.h, z3.b -; CHECK-NEXT: uunpklo z2.h, z2.b -; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s -; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z6.s -; CHECK-NEXT: uunpkhi z6.s, z3.h -; CHECK-NEXT: uunpkhi z16.s, z2.h +; CHECK-NEXT: uunpklo z7.s, z3.h +; CHECK-NEXT: uunpklo z5.s, z2.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: splice z6.h, p1, z6.h, z4.h ; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z16.s +; CHECK-NEXT: uzp1 z4.b, z6.b, z6.b +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z7.s ; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: uzp1 z1.h, z1.h, z7.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h -; CHECK-NEXT: uzp1 z1.b, z2.b, z1.b -; CHECK-NEXT: uzp1 z0.b, z0.b, z4.b -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: uunpklo z3.h, z1.b +; CHECK-NEXT: uunpklo z6.h, z0.b +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: splice z5.h, p1, z5.h, z2.h +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpklo z7.s, z0.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z7.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uunpklo z2.s, z3.h +; CHECK-NEXT: uunpklo z7.s, z6.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z7.s +; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z6.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h +; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h +; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z1.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b +; CHECK-NEXT: splice z1.b, p2, z1.b, z0.b +; CHECK-NEXT: splice z2.b, p2, z2.b, z4.b +; CHECK-NEXT: stp q1, q2, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b @@ -465,14 +582,21 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpkhi z2.s, z1.h -; CHECK-NEXT: uunpkhi z3.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = udiv <8 x i16> %op1, %op2 @@ -482,24 +606,34 @@ define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { ; CHECK-LABEL: udiv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpkhi z6.s, z0.h +; CHECK-NEXT: ptrue p1.h, vl4 +; CHECK-NEXT: uunpklo z6.s, z3.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: uunpklo z4.s, z0.h +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: uunpkhi z4.s, z1.h +; CHECK-NEXT: uunpklo z7.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpkhi z5.s, z2.h +; CHECK-NEXT: uunpklo z5.s, z2.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: uunpkhi z5.s, z3.h -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z3.s -; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z2.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h -; CHECK-NEXT: uzp1 z1.h, z1.h, z4.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: uunpklo z2.s, z3.h +; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: movprfx z2, z7 +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z6.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h +; CHECK-NEXT: splice z2.h, p1, z2.h, z1.h +; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h +; CHECK-NEXT: stp q2, q3, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a %op2 = load <16 x i16>, <16 x i16>* %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -36,16 +36,21 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: sunpklo z2.h, z1.b ; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: sunpklo z4.s, z2.h +; CHECK-NEXT: sunpklo z5.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: sunpkhi z4.s, z2.h -; CHECK-NEXT: sunpkhi z5.s, z3.h ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -58,27 +63,44 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpkhi z2.h, z1.b -; CHECK-NEXT: sunpkhi z3.h, z0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: sunpklo z5.s, z2.h +; CHECK-NEXT: sunpklo z6.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: sunpkhi z5.s, z2.h -; CHECK-NEXT: sunpkhi z6.s, z3.h ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sunpklo z4.h, z1.b -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: sunpklo z3.h, z0.b ; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: sunpkhi z6.s, z4.h -; CHECK-NEXT: sunpkhi z7.s, z3.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p1.h, vl4 +; CHECK-NEXT: sunpklo z4.h, z1.b +; CHECK-NEXT: sunpklo z6.h, z0.b +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: sunpklo z3.s, z4.h +; CHECK-NEXT: splice z5.h, p1, z5.h, z2.h +; CHECK-NEXT: sunpklo z2.s, z6.h +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 ; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: movprfx z3, z6 ; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z5.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z4.b, z5.b, z5.b +; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: splice z2.b, p0, z2.b, z4.b ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret @@ -89,51 +111,81 @@ define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; CHECK-LABEL: srem_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q3, q1, [x1] -; CHECK-NEXT: sunpkhi z5.h, z0.b +; CHECK-NEXT: ptrue p1.h, vl4 +; CHECK-NEXT: ldp q3, q2, [x1] +; CHECK-NEXT: mov z5.d, z0.d ; CHECK-NEXT: sunpklo z7.h, z0.b -; CHECK-NEXT: sunpkhi z17.s, z5.h +; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 +; CHECK-NEXT: sunpklo z5.h, z5.b +; CHECK-NEXT: sunpklo z18.s, z5.h +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 ; CHECK-NEXT: sunpklo z5.s, z5.h -; CHECK-NEXT: sunpkhi z4.h, z1.b -; CHECK-NEXT: sunpklo z6.h, z1.b -; CHECK-NEXT: sunpkhi z16.s, z4.h +; CHECK-NEXT: mov z4.d, z2.d +; CHECK-NEXT: sunpklo z6.h, z2.b +; CHECK-NEXT: ext z4.b, z4.b, z2.b, #8 +; CHECK-NEXT: sunpklo z16.s, z6.h +; CHECK-NEXT: sunpklo z4.h, z4.b +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: sunpklo z17.s, z4.h +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 ; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: sunpkhi z18.s, z6.h +; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: sunpkhi z5.s, z7.h +; CHECK-NEXT: sunpklo z18.s, z7.h +; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: sunpklo z5.s, z6.h +; CHECK-NEXT: splice z17.h, p1, z17.h, z4.h +; CHECK-NEXT: sunpklo z4.s, z7.h +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: ext z6.b, z6.b, z3.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s +; CHECK-NEXT: sunpklo z6.h, z6.b +; CHECK-NEXT: sunpklo z7.h, z7.b +; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uzp1 z5.h, z16.h, z16.h +; CHECK-NEXT: sunpklo z16.s, z6.h +; CHECK-NEXT: sunpklo z18.s, z7.h +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 ; CHECK-NEXT: sunpklo z6.s, z6.h ; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z18.s +; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s ; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s -; CHECK-NEXT: uzp1 z5.h, z6.h, z5.h -; CHECK-NEXT: sunpkhi z6.h, z3.b -; CHECK-NEXT: sunpkhi z7.h, z2.b -; CHECK-NEXT: uzp1 z4.h, z4.h, z16.h -; CHECK-NEXT: sunpkhi z16.s, z6.h -; CHECK-NEXT: sunpkhi z17.s, z7.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: splice z5.h, p1, z5.h, z4.h +; CHECK-NEXT: splice z7.h, p1, z7.h, z6.h +; CHECK-NEXT: sunpklo z4.h, z3.b +; CHECK-NEXT: sunpklo z6.h, z1.b +; CHECK-NEXT: sunpklo z16.s, z4.h +; CHECK-NEXT: sunpklo z18.s, z6.h +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: sunpklo z4.s, z4.h ; CHECK-NEXT: sunpklo z6.s, z6.h -; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s -; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: sunpklo z7.h, z3.b -; CHECK-NEXT: sunpklo z17.h, z2.b -; CHECK-NEXT: sunpkhi z18.s, z7.h -; CHECK-NEXT: sunpkhi z19.s, z17.h -; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: sunpklo z17.s, z17.h -; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z19.s -; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z17.s -; CHECK-NEXT: uzp1 z6.h, z6.h, z16.h -; CHECK-NEXT: uzp1 z7.h, z7.h, z18.h -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: uzp1 z6.b, z7.b, z6.b -; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b -; CHECK-NEXT: mls z2.b, p0/m, z6.b, z3.b -; CHECK-NEXT: mls z0.b, p0/m, z4.b, z1.b -; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z6.s +; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: splice z16.h, p1, z16.h, z4.h +; CHECK-NEXT: uzp1 z6.b, z17.b, z17.b +; CHECK-NEXT: uzp1 z5.b, z5.b, z5.b +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z4.b, z7.b, z7.b +; CHECK-NEXT: uzp1 z7.b, z16.b, z16.b +; CHECK-NEXT: ptrue p1.b, vl16 +; CHECK-NEXT: splice z7.b, p0, z7.b, z4.b +; CHECK-NEXT: splice z5.b, p0, z5.b, z6.b +; CHECK-NEXT: mls z1.b, p1/m, z7.b, z3.b +; CHECK-NEXT: mls z0.b, p1/m, z5.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b @@ -165,17 +217,23 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: sunpkhi z2.s, z1.h -; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sunpklo z4.s, z1.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: sunpklo z5.s, z0.h -; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: sunpklo z3.s, z0.h ; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s -; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = srem <8 x i16> %op1, %op2 @@ -187,24 +245,38 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q2, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: sunpkhi z17.s, z2.h +; CHECK-NEXT: ptrue p1.h, vl8 +; CHECK-NEXT: mov z17.d, z2.d +; CHECK-NEXT: ext z17.b, z17.b, z2.b, #8 ; CHECK-NEXT: ldp q3, q1, [x1] -; CHECK-NEXT: sunpkhi z5.s, z0.h +; CHECK-NEXT: mov z5.d, z0.d ; CHECK-NEXT: sunpklo z7.s, z0.h -; CHECK-NEXT: sunpkhi z16.s, z3.h -; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s -; CHECK-NEXT: sunpkhi z4.s, z1.h +; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: mov z16.d, z3.d +; CHECK-NEXT: ext z16.b, z16.b, z3.b, #8 +; CHECK-NEXT: mov z4.d, z1.d ; CHECK-NEXT: sunpklo z6.s, z1.h -; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: sunpklo z5.s, z3.h +; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8 ; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: sunpklo z7.s, z2.h -; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z7.s -; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: uzp1 z5.h, z5.h, z16.h -; CHECK-NEXT: uzp1 z4.h, z6.h, z4.h -; CHECK-NEXT: mls z2.h, p0/m, z5.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z4.h, z1.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpklo z7.s, z16.h +; CHECK-NEXT: sunpklo z16.s, z17.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: movprfx z5, z16 +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z7.s +; CHECK-NEXT: sunpklo z7.s, z3.h +; CHECK-NEXT: sunpklo z16.s, z2.h +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h +; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h +; CHECK-NEXT: mls z2.h, p1/m, z7.h, z3.h +; CHECK-NEXT: mls z0.h, p1/m, z5.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a @@ -347,16 +419,21 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z2.h, z1.b ; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: uunpklo z4.s, z2.h +; CHECK-NEXT: uunpklo z5.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpkhi z4.s, z2.h -; CHECK-NEXT: uunpkhi z5.s, z3.h ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -369,27 +446,44 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpkhi z2.h, z1.b -; CHECK-NEXT: uunpkhi z3.h, z0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: uunpklo z5.s, z2.h +; CHECK-NEXT: uunpklo z6.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpkhi z5.s, z2.h -; CHECK-NEXT: uunpkhi z6.s, z3.h ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: uunpklo z4.h, z1.b -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: uunpklo z3.h, z0.b ; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: uunpkhi z6.s, z4.h -; CHECK-NEXT: uunpkhi z7.s, z3.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: ptrue p1.h, vl4 +; CHECK-NEXT: uunpklo z4.h, z1.b +; CHECK-NEXT: uunpklo z6.h, z0.b +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uunpklo z3.s, z4.h +; CHECK-NEXT: splice z5.h, p1, z5.h, z2.h +; CHECK-NEXT: uunpklo z2.s, z6.h +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 ; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: movprfx z3, z6 ; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z5.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z4.b, z5.b, z5.b +; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: splice z2.b, p0, z2.b, z4.b ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret @@ -400,51 +494,81 @@ define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; CHECK-LABEL: urem_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q3, q1, [x1] -; CHECK-NEXT: uunpkhi z5.h, z0.b +; CHECK-NEXT: ptrue p1.h, vl4 +; CHECK-NEXT: ldp q3, q2, [x1] +; CHECK-NEXT: mov z5.d, z0.d ; CHECK-NEXT: uunpklo z7.h, z0.b -; CHECK-NEXT: uunpkhi z17.s, z5.h +; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 +; CHECK-NEXT: uunpklo z5.h, z5.b +; CHECK-NEXT: uunpklo z18.s, z5.h +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 ; CHECK-NEXT: uunpklo z5.s, z5.h -; CHECK-NEXT: uunpkhi z4.h, z1.b -; CHECK-NEXT: uunpklo z6.h, z1.b -; CHECK-NEXT: uunpkhi z16.s, z4.h +; CHECK-NEXT: mov z4.d, z2.d +; CHECK-NEXT: uunpklo z6.h, z2.b +; CHECK-NEXT: ext z4.b, z4.b, z2.b, #8 +; CHECK-NEXT: uunpklo z16.s, z6.h +; CHECK-NEXT: uunpklo z4.h, z4.b +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: uunpklo z17.s, z4.h +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 ; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: uunpkhi z18.s, z6.h +; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: uunpkhi z5.s, z7.h +; CHECK-NEXT: uunpklo z18.s, z7.h +; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: uunpklo z5.s, z6.h +; CHECK-NEXT: splice z17.h, p1, z17.h, z4.h +; CHECK-NEXT: uunpklo z4.s, z7.h +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: ext z6.b, z6.b, z3.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s +; CHECK-NEXT: uunpklo z6.h, z6.b +; CHECK-NEXT: uunpklo z7.h, z7.b +; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uzp1 z5.h, z16.h, z16.h +; CHECK-NEXT: uunpklo z16.s, z6.h +; CHECK-NEXT: uunpklo z18.s, z7.h +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 ; CHECK-NEXT: uunpklo z6.s, z6.h ; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z18.s +; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s ; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s -; CHECK-NEXT: uzp1 z5.h, z6.h, z5.h -; CHECK-NEXT: uunpkhi z6.h, z3.b -; CHECK-NEXT: uunpkhi z7.h, z2.b -; CHECK-NEXT: uzp1 z4.h, z4.h, z16.h -; CHECK-NEXT: uunpkhi z16.s, z6.h -; CHECK-NEXT: uunpkhi z17.s, z7.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: splice z5.h, p1, z5.h, z4.h +; CHECK-NEXT: splice z7.h, p1, z7.h, z6.h +; CHECK-NEXT: uunpklo z4.h, z3.b +; CHECK-NEXT: uunpklo z6.h, z1.b +; CHECK-NEXT: uunpklo z16.s, z4.h +; CHECK-NEXT: uunpklo z18.s, z6.h +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: uunpklo z4.s, z4.h ; CHECK-NEXT: uunpklo z6.s, z6.h -; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s -; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: uunpklo z7.h, z3.b -; CHECK-NEXT: uunpklo z17.h, z2.b -; CHECK-NEXT: uunpkhi z18.s, z7.h -; CHECK-NEXT: uunpkhi z19.s, z17.h -; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: uunpklo z17.s, z17.h -; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z19.s -; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z17.s -; CHECK-NEXT: uzp1 z6.h, z6.h, z16.h -; CHECK-NEXT: uzp1 z7.h, z7.h, z18.h -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: uzp1 z6.b, z7.b, z6.b -; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b -; CHECK-NEXT: mls z2.b, p0/m, z6.b, z3.b -; CHECK-NEXT: mls z0.b, p0/m, z4.b, z1.b -; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z6.s +; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: splice z16.h, p1, z16.h, z4.h +; CHECK-NEXT: uzp1 z6.b, z17.b, z17.b +; CHECK-NEXT: uzp1 z5.b, z5.b, z5.b +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z4.b, z7.b, z7.b +; CHECK-NEXT: uzp1 z7.b, z16.b, z16.b +; CHECK-NEXT: ptrue p1.b, vl16 +; CHECK-NEXT: splice z7.b, p0, z7.b, z4.b +; CHECK-NEXT: splice z5.b, p0, z5.b, z6.b +; CHECK-NEXT: mls z1.b, p1/m, z7.b, z3.b +; CHECK-NEXT: mls z0.b, p1/m, z5.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b @@ -476,17 +600,23 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpkhi z2.s, z1.h -; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: uunpklo z4.s, z1.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: uunpklo z5.s, z0.h -; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s -; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = urem <8 x i16> %op1, %op2 @@ -498,24 +628,38 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q2, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpkhi z17.s, z2.h +; CHECK-NEXT: ptrue p1.h, vl8 +; CHECK-NEXT: mov z17.d, z2.d +; CHECK-NEXT: ext z17.b, z17.b, z2.b, #8 ; CHECK-NEXT: ldp q3, q1, [x1] -; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: mov z5.d, z0.d ; CHECK-NEXT: uunpklo z7.s, z0.h -; CHECK-NEXT: uunpkhi z16.s, z3.h -; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s -; CHECK-NEXT: uunpkhi z4.s, z1.h +; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: mov z16.d, z3.d +; CHECK-NEXT: ext z16.b, z16.b, z3.b, #8 +; CHECK-NEXT: mov z4.d, z1.d ; CHECK-NEXT: uunpklo z6.s, z1.h -; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: uunpklo z5.s, z3.h +; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8 ; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: uunpklo z7.s, z2.h -; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z7.s -; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: uzp1 z5.h, z5.h, z16.h -; CHECK-NEXT: uzp1 z4.h, z6.h, z4.h -; CHECK-NEXT: mls z2.h, p0/m, z5.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z4.h, z1.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpklo z7.s, z16.h +; CHECK-NEXT: uunpklo z16.s, z17.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: movprfx z5, z16 +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z7.s +; CHECK-NEXT: uunpklo z7.s, z3.h +; CHECK-NEXT: uunpklo z16.s, z2.h +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h +; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h +; CHECK-NEXT: mls z2.h, p1/m, z7.h, z3.h +; CHECK-NEXT: mls z0.h, p1/m, z5.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a