diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1626,6 +1626,8 @@ setOperationAction(ISD::MULHU, VT, Custom); setOperationAction(ISD::ABS, VT, Custom); setOperationAction(ISD::XOR, VT, Custom); + setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::FMUL, VT, Custom); } void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { @@ -3784,7 +3786,8 @@ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); EVT SrcVT = SrcVal.getValueType(); - if (useSVEForFixedLengthVectorVT(SrcVT)) + if (useSVEForFixedLengthVectorVT(SrcVT, + Subtarget->forceStreamingCompatibleSVE())) return LowerFixedLengthFPRoundToSVE(Op, DAG); if (SrcVT != MVT::f128) { @@ -3815,7 +3818,10 @@ return LowerToPredicatedOp(Op, DAG, Opcode); } - if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT)) + if (useSVEForFixedLengthVectorVT(VT, + Subtarget->forceStreamingCompatibleSVE()) || + useSVEForFixedLengthVectorVT(InVT, + Subtarget->forceStreamingCompatibleSVE())) return LowerFixedLengthFPToIntToSVE(Op, DAG); unsigned NumElts = InVT.getVectorNumElements(); @@ -4069,7 +4075,10 @@ return LowerToPredicatedOp(Op, DAG, Opcode); } - if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT)) + if (useSVEForFixedLengthVectorVT(VT, + Subtarget->forceStreamingCompatibleSVE()) || + useSVEForFixedLengthVectorVT(InVT, + Subtarget->forceStreamingCompatibleSVE())) return LowerFixedLengthIntToFPToSVE(Op, DAG); uint64_t VTSize = VT.getFixedSizeInBits(); @@ -15422,7 +15431,7 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { - if (!Subtarget->hasNEON()) + if (!Subtarget->hasNEON() || Subtarget->forceStreamingCompatibleSVE()) return SDValue(); if (!N->getValueType(0).isSimple()) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll @@ -7,9 +7,14 @@ define void @fp_convert_combine_crash(<8 x float> *%a, <8 x i32> *%b) #0 { ; CHECK-LABEL: fp_convert_combine_crash: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: fcvtzs v0.4s, v0.4s, #3 -; CHECK-NEXT: fcvtzs v1.4s, v1.4s, #3 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s +; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %f = load <8 x float>, <8 x float>* %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -10,7 +10,10 @@ define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) #0 { ; CHECK-LABEL: fcvtzu_v4f16_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzu v0.4h, v0.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = fptoui <4 x half> %op1 to <4 x i16> ret <4 x i16> %res @@ -20,7 +23,8 @@ ; CHECK-LABEL: fcvtzu_v8f16_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: fcvtzu v0.8h, v0.8h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x half>, <8 x half>* %a @@ -33,8 +37,9 @@ ; CHECK-LABEL: fcvtzu_v16f16_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: fcvtzu v0.8h, v0.8h -; CHECK-NEXT: fcvtzu v1.8h, v1.8h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h +; CHECK-NEXT: fcvtzu z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %op1 = load <16 x half>, <16 x half>* %a @@ -50,9 +55,11 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) #0 { ; CHECK-LABEL: fcvtzu_v2f16_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = fptoui <2 x half> %op1 to <2 x i32> ret <2 x i32> %res @@ -61,8 +68,11 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) #0 { ; CHECK-LABEL: fcvtzu_v4f16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = fptoui <4 x half> %op1 to <4 x i32> ret <4 x i32> %res @@ -72,13 +82,13 @@ ; CHECK-LABEL: fcvtzu_v8f16_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: fcvtl v1.4s, v1.4h -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: fcvtzu v1.4s, v1.4s -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.h +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x half>, <8 x half>* %a %res = fptoui <8 x half> %op1 to <8 x i32> @@ -89,21 +99,21 @@ define void @fcvtzu_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 { ; CHECK-LABEL: fcvtzu_v16f16_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: fcvtl v1.4s, v1.4h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: fcvtl v3.4s, v3.4h -; CHECK-NEXT: fcvtl v2.4s, v2.4h -; CHECK-NEXT: fcvtzu v1.4s, v1.4s -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: fcvtzu v3.4s, v3.4s -; CHECK-NEXT: fcvtzu v2.4s, v2.4s -; CHECK-NEXT: stp q1, q3, [x1] -; CHECK-NEXT: stp q0, q2, [x1, #32] +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z2.s, z0.h +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z3.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: fcvtzu z3.s, p0/m, z3.h +; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.h +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h +; CHECK-NEXT: stp q3, q1, [x1, #32] +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: fcvtzu z1.s, p0/m, z2.h +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x half>, <16 x half>* %a %res = fptoui <16 x half> %op1 to <16 x i32> @@ -268,7 +278,10 @@ define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) #0 { ; CHECK-LABEL: fcvtzu_v2f32_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs v0.2s, v0.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i16> ret <2 x i16> %res @@ -279,18 +292,20 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] ; CHECK-NEXT: mov z2.s, z0.s[2] ; CHECK-NEXT: mov z0.s, z0.s[1] ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: fmov w11, s0 ; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: strh w9, [sp, #14] ; CHECK-NEXT: strh w10, [sp, #12] -; CHECK-NEXT: strh w11, [sp, #10] +; CHECK-NEXT: strh w8, [sp, #10] ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret @@ -304,10 +319,11 @@ ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: fcvtzu v1.4s, v1.4s -; CHECK-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: mov z5.s, z1.s[2] +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z2.s, z0.s[3] ; CHECK-NEXT: mov z3.s, z0.s[2] @@ -340,49 +356,52 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: fcvtzu v1.4s, v1.4s -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: mov z7.s, z1.s[2] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z4.s, z0.s[3] +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s +; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: mov z5.s, z0.s[2] -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov z6.s, z0.s[1] -; CHECK-NEXT: mov z0.s, z1.s[3] +; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z2.s, z1.s[3] +; CHECK-NEXT: mov z3.s, z1.s[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: ldp q6, q7, [x0, #32] ; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: fcvtzu v3.4s, v3.4s -; CHECK-NEXT: strh w10, [sp, #14] -; CHECK-NEXT: fmov w10, s0 -; CHECK-NEXT: mov z0.s, z1.s[1] -; CHECK-NEXT: fcvtzu v1.4s, v2.4s +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z4.s, z1.s[1] +; CHECK-NEXT: mov z1.s, z0.s[3] ; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: strh w10, [sp, #14] +; CHECK-NEXT: fmov w10, s1 ; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: strh w10, [sp, #6] -; CHECK-NEXT: mov z2.s, z3.s[2] -; CHECK-NEXT: mov z4.s, z3.s[1] +; CHECK-NEXT: fmov w8, s5 ; CHECK-NEXT: strh w9, [sp, #10] -; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: strh w10, [sp, #6] ; CHECK-NEXT: strh w8, [sp, #4] +; CHECK-NEXT: movprfx z1, z7 +; CHECK-NEXT: fcvtzu z1.s, p0/m, z7.s ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z0.s, z1.s[3] +; CHECK-NEXT: mov z2.s, z1.s[2] +; CHECK-NEXT: mov z3.s, z1.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: movprfx z1, z6 +; CHECK-NEXT: fcvtzu z1.s, p0/m, z6.s ; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: mov z0.s, z3.s[3] -; CHECK-NEXT: strh w9, [sp, #24] -; CHECK-NEXT: fmov w9, s2 ; CHECK-NEXT: strh w8, [sp, #2] ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z4.s, z1.s[3] +; CHECK-NEXT: strh w9, [sp, #24] +; CHECK-NEXT: fmov w9, s2 ; CHECK-NEXT: strh w10, [sp, #16] -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov z3.s, z1.s[3] +; CHECK-NEXT: fmov w10, s3 ; CHECK-NEXT: mov z5.s, z1.s[2] ; CHECK-NEXT: mov z6.s, z1.s[1] ; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: fmov w8, s4 ; CHECK-NEXT: strh w9, [sp, #28] ; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: strh w10, [sp, #26] @@ -407,7 +426,10 @@ define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) #0 { ; CHECK-LABEL: fcvtzu_v2f32_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzu v0.2s, v0.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i32> ret <2 x i32> %res @@ -416,7 +438,10 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) #0 { ; CHECK-LABEL: fcvtzu_v4f32_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = fptoui <4 x float> %op1 to <4 x i32> ret <4 x i32> %res @@ -426,8 +451,9 @@ ; CHECK-LABEL: fcvtzu_v8f32_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: fcvtzu v1.4s, v1.4s +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s +; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %op1 = load <8 x float>, <8 x float>* %a @@ -443,9 +469,11 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) #0 { ; CHECK-LABEL: fcvtzu_v1f32_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtl v0.2d, v0.2s -; CHECK-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = fptoui <1 x float> %op1 to <1 x i64> ret <1 x i64> %res @@ -454,8 +482,11 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) #0 { ; CHECK-LABEL: fcvtzu_v2f32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtl v0.2d, v0.2s -; CHECK-NEXT: fcvtzu v0.2d, v0.2d +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i64> ret <2 x i64> %res @@ -465,13 +496,13 @@ ; CHECK-LABEL: fcvtzu_v4f32_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: fcvtl v0.2d, v0.2s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: fcvtl v1.2d, v1.2s -; CHECK-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-NEXT: fcvtzu v1.2d, v1.2d -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.s +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <4 x float>, <4 x float>* %a %res = fptoui <4 x float> %op1 to <4 x i64> @@ -482,21 +513,21 @@ define void @fcvtzu_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 { ; CHECK-LABEL: fcvtzu_v8f32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: fcvtl v1.2d, v1.2s -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: fcvtl v0.2d, v0.2s -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: fcvtl v3.2d, v3.2s -; CHECK-NEXT: fcvtl v2.2d, v2.2s -; CHECK-NEXT: fcvtzu v1.2d, v1.2d -; CHECK-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-NEXT: fcvtzu v3.2d, v3.2d -; CHECK-NEXT: fcvtzu v2.2d, v2.2d -; CHECK-NEXT: stp q1, q3, [x1] -; CHECK-NEXT: stp q0, q2, [x1, #32] +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: uunpklo z2.d, z0.s +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uunpklo z3.d, z1.s +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.s +; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.s +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s +; CHECK-NEXT: stp q3, q1, [x1, #32] +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: fcvtzu z1.d, p0/m, z2.s +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x float>, <8 x float>* %a %res = fptoui <8 x float> %op1 to <8 x i64> @@ -522,8 +553,17 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) #0 { ; CHECK-LABEL: fcvtzu_v2f64_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i16> ret <2 x i16> %res @@ -532,25 +572,34 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(<4 x double>* %a) #0 { ; CHECK-LABEL: fcvtzu_v4f64_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: fcvtzs v1.2d, v1.2d -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: mov z1.s, z1.s[1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: mov z2.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w9, [sp, #16] +; CHECK-NEXT: stp w10, w11, [sp, #8] +; CHECK-NEXT: ldp d1, d0, [sp, #8] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: fmov w10, s0 -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w10, [sp, #14] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: mov z1.s, z1.s[1] +; CHECK-NEXT: strh w9, [sp, #30] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: strh w10, [sp, #26] +; CHECK-NEXT: ldr d0, [sp, #24] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %op1 = load <4 x double>, <4 x double>* %a %res = fptoui <4 x double> %op1 to <4 x i16> @@ -560,39 +609,57 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(<8 x double>* %a) #0 { ; CHECK-LABEL: fcvtzu_v8f64_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: fcvtzs v1.2d, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: fcvtzs v3.2d, v3.2d -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: fcvtzs v2.2d, v2.2d +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: movprfx z1, z3 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z3.d +; CHECK-NEXT: mov z3.d, z1.d[1] +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w10, [sp, #8] +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d +; CHECK-NEXT: mov z0.d, z2.d[1] +; CHECK-NEXT: stp w11, w8, [sp, #16] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: stp w9, w12, [sp, #24] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ldp d0, d2, [sp, #8] +; CHECK-NEXT: ldr d1, [sp, #24] +; CHECK-NEXT: stp w9, w10, [sp] +; CHECK-NEXT: ldr d3, [sp] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: strh w8, [sp, #44] ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z4.s, z1.s[1] -; CHECK-NEXT: xtn v3.2s, v3.2d -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: xtn v2.2s, v2.2d -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z1.s, z1.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z1.s, z3.s[1] +; CHECK-NEXT: strh w8, [sp, #40] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w10, [sp, #32] +; CHECK-NEXT: strh w9, [sp, #42] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strh w8, [sp, #36] +; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.s, z2.s[1] -; CHECK-NEXT: mov z2.s, z3.s[1] -; CHECK-NEXT: strh w9, [sp, #14] -; CHECK-NEXT: strh w8, [sp] +; CHECK-NEXT: strh w9, [sp, #34] +; CHECK-NEXT: strh w8, [sp, #46] ; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w10, [sp, #4] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: strh w10, [sp, #10] -; CHECK-NEXT: strh w9, [sp, #2] -; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: strh w8, [sp, #38] +; CHECK-NEXT: ldr q0, [sp, #32] +; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %res = fptoui <8 x double> %op1 to <8 x i16> @@ -602,71 +669,108 @@ define void @fcvtzu_v16f64_v16i16(<16 x double>* %a, <16 x i16>* %b) #0 { ; CHECK-LABEL: fcvtzu_v16f64_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 ; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: fcvtzs v1.2d, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: fcvtzs v3.2d, v3.2d -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: fcvtzs v2.2d, v2.2d -; CHECK-NEXT: ldp q6, q7, [x0, #96] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: xtn v3.2s, v3.2d -; CHECK-NEXT: mov z16.s, z1.s[1] -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: xtn v2.2s, v2.2d -; CHECK-NEXT: mov z0.s, z2.s[1] -; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: ldp q4, q5, [x0, #64] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fcvtzs v2.2d, v7.2d -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s16 -; CHECK-NEXT: mov z3.s, z3.s[1] -; CHECK-NEXT: strh w8, [sp] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fcvtzs v1.2d, v6.2d -; CHECK-NEXT: strh w10, [sp, #4] -; CHECK-NEXT: strh w9, [sp, #14] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: xtn v0.2s, v2.2d -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fcvtzs v2.2d, v5.2d -; CHECK-NEXT: fcvtzs v3.2d, v4.2d -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: mov z6.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.d, z0.d[1] +; CHECK-NEXT: fmov x11, d6 +; CHECK-NEXT: ldp q7, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #64] +; CHECK-NEXT: ldp q4, q5, [x0, #96] +; CHECK-NEXT: stp w8, w9, [sp, #32] +; CHECK-NEXT: stp w10, w11, [sp, #48] +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: movprfx z0, z7 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z7.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w9, [sp, #40] +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: movprfx z0, z5 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z5.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: movprfx z0, z4 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z4.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w9, [sp, #16] +; CHECK-NEXT: stp w10, w11, [sp, #8] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z3.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z2.d +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: ldp d0, d2, [sp, #32] +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w9, [sp, #24] +; CHECK-NEXT: ldr d1, [sp, #48] +; CHECK-NEXT: ldr d3, [sp] +; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #2] +; CHECK-NEXT: stp w10, w11, [sp, #56] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strh w8, [sp, #76] ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: xtn v2.2s, v2.2d -; CHECK-NEXT: mov z4.s, z1.s[1] -; CHECK-NEXT: xtn v1.2s, v3.2d -; CHECK-NEXT: strh w9, [sp, #6] -; CHECK-NEXT: strh w10, [sp, #28] -; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z1.s, z1.s[1] ; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: strh w9, [sp, #64] +; CHECK-NEXT: strh w8, [sp, #72] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w10, [sp, #74] +; CHECK-NEXT: strh w8, [sp, #68] ; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z3.s, z2.s[1] -; CHECK-NEXT: mov z2.s, z1.s[1] -; CHECK-NEXT: strh w9, [sp, #20] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: strh w10, [sp, #16] -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: strh w8, [sp, #30] +; CHECK-NEXT: mov z0.s, z2.s[1] +; CHECK-NEXT: mov z2.s, z3.s[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: ldr d2, [sp, #24] +; CHECK-NEXT: strh w8, [sp, #78] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: ldp d1, d0, [sp, #8] +; CHECK-NEXT: strh w9, [sp, #66] +; CHECK-NEXT: strh w8, [sp, #70] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z0.s, z1.s[1] +; CHECK-NEXT: strh w8, [sp, #92] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: ldr d1, [sp, #56] +; CHECK-NEXT: strh w9, [sp, #94] +; CHECK-NEXT: strh w8, [sp, #88] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp, #26] -; CHECK-NEXT: strh w10, [sp, #22] -; CHECK-NEXT: strh w8, [sp, #18] -; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #84] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z0.s, z1.s[1] +; CHECK-NEXT: strh w9, [sp, #80] +; CHECK-NEXT: strh w10, [sp, #86] +; CHECK-NEXT: strh w8, [sp, #90] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w8, [sp, #82] +; CHECK-NEXT: ldp q1, q0, [sp, #64] ; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %op1 = load <16 x double>, <16 x double>* %a %res = fptoui <16 x double> %op1 to <16 x i16> @@ -681,9 +785,17 @@ define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) #0 { ; CHECK-LABEL: fcvtzu_v1f64_v1i32: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i32> ret <1 x i32> %res @@ -692,8 +804,17 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) #0 { ; CHECK-LABEL: fcvtzu_v2f64_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i32> ret <2 x i32> %res @@ -702,14 +823,22 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(<4 x double>* %a) #0 { ; CHECK-LABEL: fcvtzu_v4f64_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-NEXT: fcvtzu v1.2d, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d +; CHECK-NEXT: mov z2.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret %op1 = load <4 x double>, <4 x double>* %a %res = fptoui <4 x double> %op1 to <4 x i32> @@ -719,20 +848,37 @@ define void @fcvtzu_v8f64_v8i32(<8 x double>* %a, <8 x i32>* %b) #0 { ; CHECK-LABEL: fcvtzu_v8f64_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: fcvtzu v1.2d, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: fcvtzu v2.2d, v2.2d -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: fcvtzu v3.2d, v3.2d -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: xtn v2.2s, v2.2d -; CHECK-NEXT: xtn v3.2s, v3.2d -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: stp q2, q0, [x1] +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d +; CHECK-NEXT: ldp q3, q2, [x0, #32] +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.d +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: fcvtzu z1.d, p0/m, z2.d +; CHECK-NEXT: movprfx z2, z3 +; CHECK-NEXT: fcvtzu z2.d, p0/m, z3.d +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z3.d, z0.d[1] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: mov z0.d, z2.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: stp w8, w9, [sp, #24] +; CHECK-NEXT: stp w10, w11, [sp, #16] +; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %res = fptoui <8 x double> %op1 to <8 x i32> @@ -758,7 +904,10 @@ define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) #0 { ; CHECK-LABEL: fcvtzu_v2f64_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzu v0.2d, v0.2d +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i64> ret <2 x i64> %res @@ -768,8 +917,9 @@ ; CHECK-LABEL: fcvtzu_v4f64_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-NEXT: fcvtzu v1.2d, v1.2d +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %op1 = load <4 x double>, <4 x double>* %a @@ -785,7 +935,10 @@ define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) #0 { ; CHECK-LABEL: fcvtzs_v4f16_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs v0.4h, v0.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = fptosi <4 x half> %op1 to <4 x i16> ret <4 x i16> %res @@ -795,7 +948,8 @@ ; CHECK-LABEL: fcvtzs_v8f16_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: fcvtzs v0.8h, v0.8h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x half>, <8 x half>* %a @@ -808,8 +962,9 @@ ; CHECK-LABEL: fcvtzs_v16f16_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: fcvtzs v0.8h, v0.8h -; CHECK-NEXT: fcvtzs v1.8h, v1.8h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h +; CHECK-NEXT: fcvtzs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %op1 = load <16 x half>, <16 x half>* %a @@ -825,9 +980,11 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) #0 { ; CHECK-LABEL: fcvtzs_v2f16_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = fptosi <2 x half> %op1 to <2 x i32> ret <2 x i32> %res @@ -836,8 +993,11 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) #0 { ; CHECK-LABEL: fcvtzs_v4f16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = fptosi <4 x half> %op1 to <4 x i32> ret <4 x i32> %res @@ -847,13 +1007,13 @@ ; CHECK-LABEL: fcvtzs_v8f16_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: fcvtl v1.4s, v1.4h -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: fcvtzs v1.4s, v1.4s -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.h +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x half>, <8 x half>* %a %res = fptosi <8 x half> %op1 to <8 x i32> @@ -864,21 +1024,21 @@ define void @fcvtzs_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 { ; CHECK-LABEL: fcvtzs_v16f16_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: fcvtl v1.4s, v1.4h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: fcvtl v3.4s, v3.4h -; CHECK-NEXT: fcvtl v2.4s, v2.4h -; CHECK-NEXT: fcvtzs v1.4s, v1.4s -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: fcvtzs v3.4s, v3.4s -; CHECK-NEXT: fcvtzs v2.4s, v2.4s -; CHECK-NEXT: stp q1, q3, [x1] -; CHECK-NEXT: stp q0, q2, [x1, #32] +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z2.s, z0.h +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z3.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: fcvtzs z3.s, p0/m, z3.h +; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.h +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h +; CHECK-NEXT: stp q3, q1, [x1, #32] +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: fcvtzs z1.s, p0/m, z2.h +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x half>, <16 x half>* %a %res = fptosi <16 x half> %op1 to <16 x i32> @@ -1044,7 +1204,10 @@ define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) #0 { ; CHECK-LABEL: fcvtzs_v2f32_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs v0.2s, v0.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i16> ret <2 x i16> %res @@ -1055,18 +1218,20 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] ; CHECK-NEXT: mov z2.s, z0.s[2] ; CHECK-NEXT: mov z0.s, z0.s[1] ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: fmov w11, s0 ; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: strh w9, [sp, #14] ; CHECK-NEXT: strh w10, [sp, #12] -; CHECK-NEXT: strh w11, [sp, #10] +; CHECK-NEXT: strh w8, [sp, #10] ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret @@ -1080,10 +1245,11 @@ ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: fcvtzs v1.4s, v1.4s -; CHECK-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: mov z5.s, z1.s[2] +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z2.s, z0.s[3] ; CHECK-NEXT: mov z3.s, z0.s[2] @@ -1116,49 +1282,52 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: fcvtzs v1.4s, v1.4s -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: mov z7.s, z1.s[2] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z4.s, z0.s[3] +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s +; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: mov z5.s, z0.s[2] -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov z6.s, z0.s[1] -; CHECK-NEXT: mov z0.s, z1.s[3] +; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z2.s, z1.s[3] +; CHECK-NEXT: mov z3.s, z1.s[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: ldp q6, q7, [x0, #32] ; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: fcvtzs v3.4s, v3.4s -; CHECK-NEXT: strh w10, [sp, #14] -; CHECK-NEXT: fmov w10, s0 -; CHECK-NEXT: mov z0.s, z1.s[1] -; CHECK-NEXT: fcvtzs v1.4s, v2.4s +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z4.s, z1.s[1] +; CHECK-NEXT: mov z1.s, z0.s[3] ; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: strh w10, [sp, #14] +; CHECK-NEXT: fmov w10, s1 ; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: strh w10, [sp, #6] -; CHECK-NEXT: mov z2.s, z3.s[2] -; CHECK-NEXT: mov z4.s, z3.s[1] +; CHECK-NEXT: fmov w8, s5 ; CHECK-NEXT: strh w9, [sp, #10] -; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: strh w10, [sp, #6] ; CHECK-NEXT: strh w8, [sp, #4] +; CHECK-NEXT: movprfx z1, z7 +; CHECK-NEXT: fcvtzs z1.s, p0/m, z7.s ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z0.s, z1.s[3] +; CHECK-NEXT: mov z2.s, z1.s[2] +; CHECK-NEXT: mov z3.s, z1.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: movprfx z1, z6 +; CHECK-NEXT: fcvtzs z1.s, p0/m, z6.s ; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: mov z0.s, z3.s[3] -; CHECK-NEXT: strh w9, [sp, #24] -; CHECK-NEXT: fmov w9, s2 ; CHECK-NEXT: strh w8, [sp, #2] ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z4.s, z1.s[3] +; CHECK-NEXT: strh w9, [sp, #24] +; CHECK-NEXT: fmov w9, s2 ; CHECK-NEXT: strh w10, [sp, #16] -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov z3.s, z1.s[3] +; CHECK-NEXT: fmov w10, s3 ; CHECK-NEXT: mov z5.s, z1.s[2] ; CHECK-NEXT: mov z6.s, z1.s[1] ; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: fmov w8, s4 ; CHECK-NEXT: strh w9, [sp, #28] ; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: strh w10, [sp, #26] @@ -1183,7 +1352,10 @@ define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) #0 { ; CHECK-LABEL: fcvtzs_v2f32_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs v0.2s, v0.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i32> ret <2 x i32> %res @@ -1192,7 +1364,10 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) #0 { ; CHECK-LABEL: fcvtzs_v4f32_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = fptosi <4 x float> %op1 to <4 x i32> ret <4 x i32> %res @@ -1202,8 +1377,9 @@ ; CHECK-LABEL: fcvtzs_v8f32_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s +; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %op1 = load <8 x float>, <8 x float>* %a @@ -1219,9 +1395,11 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) #0 { ; CHECK-LABEL: fcvtzs_v1f32_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtl v0.2d, v0.2s -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = fptosi <1 x float> %op1 to <1 x i64> ret <1 x i64> %res @@ -1230,8 +1408,11 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) #0 { ; CHECK-LABEL: fcvtzs_v2f32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtl v0.2d, v0.2s -; CHECK-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i64> ret <2 x i64> %res @@ -1241,13 +1422,13 @@ ; CHECK-LABEL: fcvtzs_v4f32_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: fcvtl v0.2d, v0.2s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: fcvtl v1.2d, v1.2s -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: fcvtzs v1.2d, v1.2d -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.s +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <4 x float>, <4 x float>* %a %res = fptosi <4 x float> %op1 to <4 x i64> @@ -1258,21 +1439,21 @@ define void @fcvtzs_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 { ; CHECK-LABEL: fcvtzs_v8f32_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: fcvtl v1.2d, v1.2s -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: fcvtl v0.2d, v0.2s -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: fcvtl v3.2d, v3.2s -; CHECK-NEXT: fcvtl v2.2d, v2.2s -; CHECK-NEXT: fcvtzs v1.2d, v1.2d -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: fcvtzs v3.2d, v3.2d -; CHECK-NEXT: fcvtzs v2.2d, v2.2d -; CHECK-NEXT: stp q1, q3, [x1] -; CHECK-NEXT: stp q0, q2, [x1, #32] +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: uunpklo z2.d, z0.s +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uunpklo z3.d, z1.s +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.s +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.s +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s +; CHECK-NEXT: stp q3, q1, [x1, #32] +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z2.s +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x float>, <8 x float>* %a %res = fptosi <8 x float> %op1 to <8 x i64> @@ -1300,8 +1481,17 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) #0 { ; CHECK-LABEL: fcvtzs_v2f64_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i16> ret <2 x i16> %res @@ -1310,25 +1500,34 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(<4 x double>* %a) #0 { ; CHECK-LABEL: fcvtzs_v4f64_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: fcvtzs v1.2d, v1.2d -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: mov z1.s, z1.s[1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: mov z2.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w9, [sp, #16] +; CHECK-NEXT: stp w10, w11, [sp, #8] +; CHECK-NEXT: ldp d1, d0, [sp, #8] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: fmov w10, s0 -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w10, [sp, #14] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: mov z1.s, z1.s[1] +; CHECK-NEXT: strh w9, [sp, #30] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: strh w10, [sp, #26] +; CHECK-NEXT: ldr d0, [sp, #24] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %op1 = load <4 x double>, <4 x double>* %a %res = fptosi <4 x double> %op1 to <4 x i16> @@ -1338,39 +1537,57 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(<8 x double>* %a) #0 { ; CHECK-LABEL: fcvtzs_v8f64_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: fcvtzs v1.2d, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: fcvtzs v3.2d, v3.2d -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: fcvtzs v2.2d, v2.2d +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: movprfx z1, z3 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z3.d +; CHECK-NEXT: mov z3.d, z1.d[1] +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w10, [sp, #8] +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d +; CHECK-NEXT: mov z0.d, z2.d[1] +; CHECK-NEXT: stp w11, w8, [sp, #16] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: stp w9, w12, [sp, #24] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ldp d0, d2, [sp, #8] +; CHECK-NEXT: ldr d1, [sp, #24] +; CHECK-NEXT: stp w9, w10, [sp] +; CHECK-NEXT: ldr d3, [sp] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: strh w8, [sp, #44] ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z4.s, z1.s[1] -; CHECK-NEXT: xtn v3.2s, v3.2d -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: xtn v2.2s, v2.2d -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z1.s, z1.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z1.s, z3.s[1] +; CHECK-NEXT: strh w8, [sp, #40] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w10, [sp, #32] +; CHECK-NEXT: strh w9, [sp, #42] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strh w8, [sp, #36] +; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.s, z2.s[1] -; CHECK-NEXT: mov z2.s, z3.s[1] -; CHECK-NEXT: strh w9, [sp, #14] -; CHECK-NEXT: strh w8, [sp] +; CHECK-NEXT: strh w9, [sp, #34] +; CHECK-NEXT: strh w8, [sp, #46] ; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w10, [sp, #4] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: strh w10, [sp, #10] -; CHECK-NEXT: strh w9, [sp, #2] -; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: strh w8, [sp, #38] +; CHECK-NEXT: ldr q0, [sp, #32] +; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %res = fptosi <8 x double> %op1 to <8 x i16> @@ -1380,71 +1597,108 @@ define void @fcvtzs_v16f64_v16i16(<16 x double>* %a, <16 x i16>* %b) #0 { ; CHECK-LABEL: fcvtzs_v16f64_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 ; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: fcvtzs v1.2d, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: fcvtzs v3.2d, v3.2d -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: fcvtzs v2.2d, v2.2d -; CHECK-NEXT: ldp q6, q7, [x0, #96] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: xtn v3.2s, v3.2d -; CHECK-NEXT: mov z16.s, z1.s[1] -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: xtn v2.2s, v2.2d -; CHECK-NEXT: mov z0.s, z2.s[1] -; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: ldp q4, q5, [x0, #64] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fcvtzs v2.2d, v7.2d -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s16 -; CHECK-NEXT: mov z3.s, z3.s[1] -; CHECK-NEXT: strh w8, [sp] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fcvtzs v1.2d, v6.2d -; CHECK-NEXT: strh w10, [sp, #4] -; CHECK-NEXT: strh w9, [sp, #14] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: xtn v0.2s, v2.2d -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fcvtzs v2.2d, v5.2d -; CHECK-NEXT: fcvtzs v3.2d, v4.2d -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: mov z6.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.d, z0.d[1] +; CHECK-NEXT: fmov x11, d6 +; CHECK-NEXT: ldp q7, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #64] +; CHECK-NEXT: ldp q4, q5, [x0, #96] +; CHECK-NEXT: stp w8, w9, [sp, #32] +; CHECK-NEXT: stp w10, w11, [sp, #48] +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: movprfx z0, z7 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z7.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w9, [sp, #40] +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: movprfx z0, z5 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z5.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: movprfx z0, z4 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z4.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w9, [sp, #16] +; CHECK-NEXT: stp w10, w11, [sp, #8] +; CHECK-NEXT: movprfx z0, z3 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z3.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z2.d +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: ldp d0, d2, [sp, #32] +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w9, [sp, #24] +; CHECK-NEXT: ldr d1, [sp, #48] +; CHECK-NEXT: ldr d3, [sp] +; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #2] +; CHECK-NEXT: stp w10, w11, [sp, #56] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strh w8, [sp, #76] ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: xtn v2.2s, v2.2d -; CHECK-NEXT: mov z4.s, z1.s[1] -; CHECK-NEXT: xtn v1.2s, v3.2d -; CHECK-NEXT: strh w9, [sp, #6] -; CHECK-NEXT: strh w10, [sp, #28] -; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z1.s, z1.s[1] ; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: strh w9, [sp, #64] +; CHECK-NEXT: strh w8, [sp, #72] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strh w10, [sp, #74] +; CHECK-NEXT: strh w8, [sp, #68] ; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z3.s, z2.s[1] -; CHECK-NEXT: mov z2.s, z1.s[1] -; CHECK-NEXT: strh w9, [sp, #20] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: strh w10, [sp, #16] -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: strh w8, [sp, #30] +; CHECK-NEXT: mov z0.s, z2.s[1] +; CHECK-NEXT: mov z2.s, z3.s[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: ldr d2, [sp, #24] +; CHECK-NEXT: strh w8, [sp, #78] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: ldp d1, d0, [sp, #8] +; CHECK-NEXT: strh w9, [sp, #66] +; CHECK-NEXT: strh w8, [sp, #70] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z0.s, z1.s[1] +; CHECK-NEXT: strh w8, [sp, #92] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: ldr d1, [sp, #56] +; CHECK-NEXT: strh w9, [sp, #94] +; CHECK-NEXT: strh w8, [sp, #88] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp, #26] -; CHECK-NEXT: strh w10, [sp, #22] -; CHECK-NEXT: strh w8, [sp, #18] -; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #84] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z0.s, z1.s[1] +; CHECK-NEXT: strh w9, [sp, #80] +; CHECK-NEXT: strh w10, [sp, #86] +; CHECK-NEXT: strh w8, [sp, #90] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w8, [sp, #82] +; CHECK-NEXT: ldp q1, q0, [sp, #64] ; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %op1 = load <16 x double>, <16 x double>* %a %res = fptosi <16 x double> %op1 to <16 x i16> @@ -1459,9 +1713,17 @@ define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) #0 { ; CHECK-LABEL: fcvtzs_v1f64_v1i32: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i32> ret <1 x i32> %res @@ -1470,8 +1732,17 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) #0 { ; CHECK-LABEL: fcvtzs_v2f64_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i32> ret <2 x i32> %res @@ -1480,14 +1751,22 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(<4 x double>* %a) #0 { ; CHECK-LABEL: fcvtzs_v4f64_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: fcvtzs v1.2d, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: mov z2.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret %op1 = load <4 x double>, <4 x double>* %a %res = fptosi <4 x double> %op1 to <4 x i32> @@ -1497,20 +1776,37 @@ define void @fcvtzs_v8f64_v8i32(<8 x double>* %a, <8 x i32>* %b) #0 { ; CHECK-LABEL: fcvtzs_v8f64_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: fcvtzs v1.2d, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: fcvtzs v2.2d, v2.2d -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: fcvtzs v3.2d, v3.2d -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: xtn v2.2s, v2.2d -; CHECK-NEXT: xtn v3.2s, v3.2d -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: stp q2, q0, [x1] +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: ldp q3, q2, [x0, #32] +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.d +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: fcvtzs z1.d, p0/m, z2.d +; CHECK-NEXT: movprfx z2, z3 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z3.d +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z3.d, z0.d[1] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: mov z0.d, z2.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: stp w8, w9, [sp, #24] +; CHECK-NEXT: stp w10, w11, [sp, #16] +; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %res = fptosi <8 x double> %op1 to <8 x i32> @@ -1536,7 +1832,10 @@ define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) #0 { ; CHECK-LABEL: fcvtzs_v2f64_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i64> ret <2 x i64> %res @@ -1546,8 +1845,9 @@ ; CHECK-LABEL: fcvtzs_v4f64_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: fcvtzs v1.2d, v1.2d +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %op1 = load <4 x double>, <4 x double>* %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -10,7 +10,10 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) #0 { ; CHECK-LABEL: ucvtf_v4i16_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ucvtf v0.4h, v0.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = uitofp <4 x i16> %op1 to <4 x half> ret <4 x half> %res @@ -20,7 +23,8 @@ ; CHECK-LABEL: ucvtf_v8i16_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ucvtf v0.8h, v0.8h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x i16>, <8 x i16>* %a @@ -33,8 +37,9 @@ ; CHECK-LABEL: ucvtf_v16i16_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ucvtf v0.8h, v0.8h -; CHECK-NEXT: ucvtf v1.8h, v1.8h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h +; CHECK-NEXT: ucvtf z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a @@ -52,9 +57,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI3_0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0] ; CHECK-NEXT: and z0.d, z0.d, z1.d -; CHECK-NEXT: ucvtf v0.2s, v0.2s +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = uitofp <2 x i16> %op1 to <2 x float> ret <2 x float> %res @@ -64,8 +71,10 @@ ; CHECK-LABEL: ucvtf_v4i16_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: ucvtf v0.4s, v0.4s +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = uitofp <4 x i16> %op1 to <4 x float> ret <4 x float> %res @@ -75,11 +84,12 @@ ; CHECK-LABEL: ucvtf_v8i16_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: ucvtf v1.4s, v1.4s -; CHECK-NEXT: ucvtf v0.4s, v0.4s +; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x i16>, <8 x i16>* %a @@ -91,19 +101,21 @@ define void @ucvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 { ; CHECK-LABEL: ucvtf_v16i16_v16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: ucvtf v3.4s, v3.4s +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: ucvtf v2.4s, v2.4s -; CHECK-NEXT: ucvtf v0.4s, v0.4s -; CHECK-NEXT: ucvtf v1.4s, v1.4s -; CHECK-NEXT: stp q3, q1, [x1] -; CHECK-NEXT: stp q2, q0, [x1, #32] +; CHECK-NEXT: uunpklo z3.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: ucvtf z3.s, p0/m, z3.s +; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s +; CHECK-NEXT: stp q3, q1, [x1, #32] +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: ucvtf z1.s, p0/m, z2.s +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a %res = uitofp <16 x i16> %op1 to <16 x float> @@ -132,10 +144,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI8_0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI8_0] ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = uitofp <2 x i16> %op1 to <2 x double> ret <2 x double> %res @@ -145,12 +159,13 @@ ; CHECK-LABEL: ucvtf_v4i16_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <4 x i16>, <4 x i16>* %a @@ -163,21 +178,23 @@ ; CHECK-LABEL: ucvtf_v8i16_v8f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z3.d, z1.s -; CHECK-NEXT: uunpklo z2.d, z0.s +; CHECK-NEXT: uunpklo z2.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: ucvtf z2.d, p0/m, z2.d ; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: ucvtf v2.2d, v2.2d -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v3.2d, v3.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: stp q3, q0, [x1, #32] +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: ucvtf z0.d, p0/m, z1.d +; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x i16>, <8 x i16>* %a %res = uitofp <8 x i16> %op1 to <8 x double> @@ -189,6 +206,7 @@ ; CHECK-LABEL: ucvtf_v16i16_v16f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h @@ -196,29 +214,33 @@ ; CHECK-NEXT: uunpklo z3.s, z1.h ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z5.d, z3.s -; CHECK-NEXT: uunpklo z6.d, z1.s +; CHECK-NEXT: uunpklo z6.d, z0.s +; CHECK-NEXT: uunpklo z7.d, z1.s ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: ucvtf v6.2d, v6.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: uunpklo z7.d, z0.s +; CHECK-NEXT: uunpklo z5.d, z3.s ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ucvtf z7.d, p0/m, z7.d +; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.d, z3.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ucvtf v4.2d, v4.2d -; CHECK-NEXT: stp q6, q1, [x1, #96] -; CHECK-NEXT: uunpklo z1.d, z2.s -; CHECK-NEXT: ucvtf v5.2d, v5.2d -; CHECK-NEXT: ucvtf v7.2d, v7.2d -; CHECK-NEXT: ucvtf v3.2d, v3.2d -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: stp q7, q0, [x1, #32] -; CHECK-NEXT: stp q4, q1, [x1] -; CHECK-NEXT: stp q5, q3, [x1, #64] +; CHECK-NEXT: stp q7, q1, [x1, #96] +; CHECK-NEXT: ucvtf z5.d, p0/m, z5.d +; CHECK-NEXT: movprfx z1, z3 +; CHECK-NEXT: ucvtf z1.d, p0/m, z3.d +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: stp q5, q1, [x1, #64] +; CHECK-NEXT: movprfx z3, z6 +; CHECK-NEXT: ucvtf z3.d, p0/m, z6.d +; CHECK-NEXT: stp q3, q0, [x1, #32] +; CHECK-NEXT: movprfx z1, z4 +; CHECK-NEXT: ucvtf z1.d, p0/m, z4.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: ucvtf z0.d, p0/m, z2.d +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a %res = uitofp <16 x i16> %op1 to <16 x double> @@ -233,9 +255,24 @@ define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) #0 { ; CHECK-LABEL: ucvtf_v2i32_v2f16: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ucvtf v0.4s, v0.4s -; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x half> ret <2 x half> %res @@ -244,8 +281,24 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) #0 { ; CHECK-LABEL: ucvtf_v4i32_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ucvtf v0.4s, v0.4s -; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %res = uitofp <4 x i32> %op1 to <4 x half> ret <4 x half> %res @@ -254,14 +307,35 @@ define <8 x half> @ucvtf_v8i32_v8f16(<8 x i32>* %a) #0 { ; CHECK-LABEL: ucvtf_v8i32_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: ucvtf v0.4s, v0.4s -; CHECK-NEXT: ucvtf v1.4s, v1.4s -; CHECK-NEXT: fcvtn v0.4h, v0.4s -; CHECK-NEXT: fcvtn v1.4h, v1.4s -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ucvtf z1.h, p0/m, z1.s +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z5.s, z1.s[2] +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z2.s, z0.s[3] +; CHECK-NEXT: mov z3.s, z0.s[2] +; CHECK-NEXT: mov z4.s, z0.s[1] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w9, [sp] +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: mov z0.s, z1.s[3] +; CHECK-NEXT: mov z1.s, z1.s[1] +; CHECK-NEXT: strh w10, [sp, #14] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: strh w9, [sp, #10] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strh w10, [sp, #6] +; CHECK-NEXT: strh w8, [sp, #4] +; CHECK-NEXT: strh w9, [sp, #2] +; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret %op1 = load <8 x i32>, <8 x i32>* %a %res = uitofp <8 x i32> %op1 to <8 x half> @@ -271,20 +345,64 @@ define void @ucvtf_v16i32_v16f16(<16 x i32>* %a, <16 x half>* %b) #0 { ; CHECK-LABEL: ucvtf_v16i32_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: ucvtf v0.4s, v0.4s -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ucvtf v1.4s, v1.4s -; CHECK-NEXT: fcvtn v0.4h, v0.4s -; CHECK-NEXT: ucvtf v2.4s, v2.4s -; CHECK-NEXT: fcvtn v1.4h, v1.4s -; CHECK-NEXT: ucvtf v3.4s, v3.4s -; CHECK-NEXT: fcvtn v2.4h, v2.4s -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: fcvtn v3.4h, v3.4s -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: stp q2, q0, [x1] +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z5.s, z0.s[2] +; CHECK-NEXT: ucvtf z1.h, p0/m, z1.s +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z2.s, z1.s[3] +; CHECK-NEXT: mov z3.s, z1.s[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: ldp q6, q7, [x0, #32] +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z4.s, z1.s[1] +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: strh w9, [sp] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: strh w10, [sp, #14] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: strh w9, [sp, #10] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: strh w10, [sp, #6] +; CHECK-NEXT: strh w8, [sp, #4] +; CHECK-NEXT: movprfx z1, z7 +; CHECK-NEXT: ucvtf z1.h, p0/m, z7.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z0.s, z1.s[3] +; CHECK-NEXT: mov z2.s, z1.s[2] +; CHECK-NEXT: mov z3.s, z1.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: movprfx z1, z6 +; CHECK-NEXT: ucvtf z1.h, p0/m, z6.s +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: strh w8, [sp, #2] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z4.s, z1.s[3] +; CHECK-NEXT: strh w9, [sp, #24] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strh w10, [sp, #16] +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: mov z5.s, z1.s[2] +; CHECK-NEXT: mov z6.s, z1.s[1] +; CHECK-NEXT: strh w8, [sp, #30] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: strh w9, [sp, #28] +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: strh w10, [sp, #26] +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: strh w9, [sp, #20] +; CHECK-NEXT: strh w10, [sp, #18] +; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a %res = uitofp <16 x i32> %op1 to <16 x half> @@ -299,7 +417,10 @@ define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) #0 { ; CHECK-LABEL: ucvtf_v2i32_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ucvtf v0.2s, v0.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x float> ret <2 x float> %res @@ -308,7 +429,10 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) #0 { ; CHECK-LABEL: ucvtf_v4i32_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ucvtf v0.4s, v0.4s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = uitofp <4 x i32> %op1 to <4 x float> ret <4 x float> %res @@ -318,8 +442,9 @@ ; CHECK-LABEL: ucvtf_v8i32_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ucvtf v0.4s, v0.4s -; CHECK-NEXT: ucvtf v1.4s, v1.4s +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s +; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %op1 = load <8 x i32>, <8 x i32>* %a @@ -336,8 +461,10 @@ ; CHECK-LABEL: ucvtf_v2i32_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x double> ret <2 x double> %res @@ -347,11 +474,12 @@ ; CHECK-LABEL: ucvtf_v4i32_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <4 x i32>, <4 x i32>* %a @@ -363,19 +491,21 @@ define void @ucvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 { ; CHECK-LABEL: ucvtf_v8i32_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: uunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: ucvtf v3.2d, v3.2d +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ucvtf v2.2d, v2.2d -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: stp q3, q1, [x1] -; CHECK-NEXT: stp q2, q0, [x1, #32] +; CHECK-NEXT: uunpklo z3.d, z1.s +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d +; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: stp q3, q1, [x1, #32] +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: ucvtf z1.d, p0/m, z2.d +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x i32>, <8 x i32>* %a %res = uitofp <8 x i32> %op1 to <8 x double> @@ -410,14 +540,37 @@ define <4 x half> @ucvtf_v4i64_v4f16(<4 x i64>* %a) #0 { ; CHECK-LABEL: ucvtf_v4i64_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn v1.2s, v1.2d -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d +; CHECK-NEXT: mov z2.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: ucvtf z0.s, p0/m, z1.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: ldr q0, [sp] +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #30] +; CHECK-NEXT: strh w10, [sp, #28] +; CHECK-NEXT: strh w8, [sp, #26] +; CHECK-NEXT: ldr d0, [sp, #24] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %op1 = load <4 x i64>, <4 x i64>* %a %res = uitofp <4 x i64> %op1 to <4 x half> @@ -427,24 +580,62 @@ define <8 x half> @ucvtf_v8i64_v8f16(<8 x i64>* %a) #0 { ; CHECK-LABEL: ucvtf_v8i64_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d +; CHECK-NEXT: fmov x9, d1 ; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: ucvtf v2.2d, v2.2d -; CHECK-NEXT: fcvtn v1.2s, v1.2d -; CHECK-NEXT: ucvtf v3.2d, v3.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: fcvtn v3.2s, v3.2d -; CHECK-NEXT: fcvtn v1.4h, v0.4s -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: fcvtn v0.4h, v2.4s -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: ucvtf z2.s, p0/m, z2.d +; CHECK-NEXT: movprfx z1, z3 +; CHECK-NEXT: ucvtf z1.s, p0/m, z3.d +; CHECK-NEXT: mov z0.d, z2.d[1] +; CHECK-NEXT: stp w8, w10, [sp, #24] +; CHECK-NEXT: mov z3.d, z1.d[1] +; CHECK-NEXT: stp w9, w12, [sp, #16] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: ldr q0, [sp, #16] +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s +; CHECK-NEXT: stp w11, w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: stp w9, w10, [sp] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: strh w8, [sp, #40] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: strh w8, [sp, #46] +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fcvt z0.h, p0/m, z1.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z3.s, z0.s[1] +; CHECK-NEXT: strh w9, [sp, #44] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strh w10, [sp, #42] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #32] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: strh w9, [sp, #38] +; CHECK-NEXT: strh w10, [sp, #36] +; CHECK-NEXT: strh w8, [sp, #34] +; CHECK-NEXT: ldr q0, [sp, #32] +; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a %res = uitofp <8 x i64> %op1 to <8 x half> @@ -458,8 +649,17 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) #0 { ; CHECK-LABEL: ucvtf_v2i64_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %res = uitofp <2 x i64> %op1 to <2 x float> ret <2 x float> %res @@ -468,14 +668,22 @@ define <4 x float> @ucvtf_v4i64_v4f32(<4 x i64>* %a) #0 { ; CHECK-LABEL: ucvtf_v4i64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn v1.2s, v1.2d -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d +; CHECK-NEXT: mov z2.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: ucvtf z0.s, p0/m, z1.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret %op1 = load <4 x i64>, <4 x i64>* %a %res = uitofp <4 x i64> %op1 to <4 x float> @@ -485,20 +693,37 @@ define void @ucvtf_v8i64_v8f32(<8 x i64>* %a, <8 x float>* %b) #0 { ; CHECK-LABEL: ucvtf_v8i64_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: ucvtf v2.2d, v2.2d -; CHECK-NEXT: fcvtn v1.2s, v1.2d -; CHECK-NEXT: ucvtf v3.2d, v3.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: fcvtn v3.2s, v3.2d -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: stp q2, q0, [x1] +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d +; CHECK-NEXT: ldp q3, q2, [x0, #32] +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: ucvtf z0.s, p0/m, z1.d +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: ucvtf z1.s, p0/m, z2.d +; CHECK-NEXT: movprfx z2, z3 +; CHECK-NEXT: ucvtf z2.s, p0/m, z3.d +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z3.d, z0.d[1] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: mov z0.d, z2.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: stp w8, w9, [sp, #24] +; CHECK-NEXT: stp w10, w11, [sp, #16] +; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a %res = uitofp <8 x i64> %op1 to <8 x float> @@ -513,7 +738,10 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) #0 { ; CHECK-LABEL: ucvtf_v2i64_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = uitofp <2 x i64> %op1 to <2 x double> ret <2 x double> %res @@ -523,8 +751,9 @@ ; CHECK-LABEL: ucvtf_v4i64_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %op1 = load <4 x i64>, <4 x i64>* %a @@ -540,7 +769,10 @@ define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) #0 { ; CHECK-LABEL: scvtf_v4i16_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: scvtf v0.4h, v0.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: scvtf z0.h, p0/m, z0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = sitofp <4 x i16> %op1 to <4 x half> ret <4 x half> %res @@ -550,7 +782,8 @@ ; CHECK-LABEL: scvtf_v8i16_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: scvtf v0.8h, v0.8h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x i16>, <8 x i16>* %a @@ -563,8 +796,9 @@ ; CHECK-LABEL: scvtf_v16i16_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: scvtf v0.8h, v0.8h -; CHECK-NEXT: scvtf v1.8h, v1.8h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: scvtf z0.h, p0/m, z0.h +; CHECK-NEXT: scvtf z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a @@ -585,7 +819,8 @@ ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI33_0] ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: scvtf v0.2s, v0.2s +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = sitofp <2 x i16> %op1 to <2 x float> ret <2 x float> %res @@ -595,8 +830,10 @@ ; CHECK-LABEL: scvtf_v4i16_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: scvtf v0.4s, v0.4s +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = sitofp <4 x i16> %op1 to <4 x float> ret <4 x float> %res @@ -606,11 +843,12 @@ ; CHECK-LABEL: scvtf_v8i16_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z1.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: scvtf v1.4s, v1.4s -; CHECK-NEXT: scvtf v0.4s, v0.4s +; CHECK-NEXT: scvtf z1.s, p0/m, z1.s +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x i16>, <8 x i16>* %a @@ -622,19 +860,21 @@ define void @scvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 { ; CHECK-LABEL: scvtf_v16i16_v16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: scvtf v3.4s, v3.4s +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z2.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: scvtf v2.4s, v2.4s -; CHECK-NEXT: scvtf v0.4s, v0.4s -; CHECK-NEXT: scvtf v1.4s, v1.4s -; CHECK-NEXT: stp q3, q1, [x1] -; CHECK-NEXT: stp q2, q0, [x1, #32] +; CHECK-NEXT: sunpklo z3.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: scvtf z3.s, p0/m, z3.s +; CHECK-NEXT: scvtf z1.s, p0/m, z1.s +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s +; CHECK-NEXT: stp q3, q1, [x1, #32] +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: scvtf z1.s, p0/m, z2.s +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a %res = sitofp <16 x i16> %op1 to <16 x float> @@ -655,8 +895,10 @@ ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI37_0] ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = sitofp <2 x i16> %op1 to <2 x double> ret <2 x double> %res @@ -666,12 +908,13 @@ ; CHECK-LABEL: scvtf_v4i16_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z1.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: scvtf z1.d, p0/m, z1.d +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <4 x i16>, <4 x i16>* %a @@ -684,21 +927,23 @@ ; CHECK-LABEL: scvtf_v8i16_v8f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: sunpklo z1.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z3.d, z1.s -; CHECK-NEXT: sunpklo z2.d, z0.s +; CHECK-NEXT: sunpklo z2.d, z1.s +; CHECK-NEXT: sunpklo z3.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: scvtf z3.d, p0/m, z3.d +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: scvtf z2.d, p0/m, z2.d ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: scvtf v2.2d, v2.2d -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v3.2d, v3.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: stp q3, q0, [x1, #32] +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: scvtf z0.d, p0/m, z1.d +; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x i16>, <8 x i16>* %a %res = sitofp <8 x i16> %op1 to <8 x double> @@ -710,6 +955,7 @@ ; CHECK-LABEL: scvtf_v16i16_v16f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: sunpklo z2.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h @@ -717,29 +963,33 @@ ; CHECK-NEXT: sunpklo z3.s, z1.h ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z5.d, z3.s -; CHECK-NEXT: sunpklo z6.d, z1.s +; CHECK-NEXT: sunpklo z6.d, z0.s +; CHECK-NEXT: sunpklo z7.d, z1.s ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: scvtf v6.2d, v6.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: sunpklo z7.d, z0.s +; CHECK-NEXT: sunpklo z5.d, z3.s ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: scvtf z7.d, p0/m, z7.d +; CHECK-NEXT: scvtf z1.d, p0/m, z1.d ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.d, z3.s -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: scvtf v4.2d, v4.2d -; CHECK-NEXT: stp q6, q1, [x1, #96] -; CHECK-NEXT: sunpklo z1.d, z2.s -; CHECK-NEXT: scvtf v5.2d, v5.2d -; CHECK-NEXT: scvtf v7.2d, v7.2d -; CHECK-NEXT: scvtf v3.2d, v3.2d -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: stp q7, q0, [x1, #32] -; CHECK-NEXT: stp q4, q1, [x1] -; CHECK-NEXT: stp q5, q3, [x1, #64] +; CHECK-NEXT: stp q7, q1, [x1, #96] +; CHECK-NEXT: scvtf z5.d, p0/m, z5.d +; CHECK-NEXT: movprfx z1, z3 +; CHECK-NEXT: scvtf z1.d, p0/m, z3.d +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: stp q5, q1, [x1, #64] +; CHECK-NEXT: movprfx z3, z6 +; CHECK-NEXT: scvtf z3.d, p0/m, z6.d +; CHECK-NEXT: stp q3, q0, [x1, #32] +; CHECK-NEXT: movprfx z1, z4 +; CHECK-NEXT: scvtf z1.d, p0/m, z4.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: scvtf z0.d, p0/m, z2.d +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a %res = sitofp <16 x i16> %op1 to <16 x double> @@ -754,9 +1004,24 @@ define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) #0 { ; CHECK-LABEL: scvtf_v2i32_v2f16: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: scvtf v0.4s, v0.4s -; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.h, p0/m, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x half> ret <2 x half> %res @@ -765,8 +1030,24 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) #0 { ; CHECK-LABEL: scvtf_v4i32_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: scvtf v0.4s, v0.4s -; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.h, p0/m, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %res = sitofp <4 x i32> %op1 to <4 x half> ret <4 x half> %res @@ -775,14 +1056,35 @@ define <8 x half> @scvtf_v8i32_v8f16(<8 x i32>* %a) #0 { ; CHECK-LABEL: scvtf_v8i32_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: scvtf v0.4s, v0.4s -; CHECK-NEXT: scvtf v1.4s, v1.4s -; CHECK-NEXT: fcvtn v0.4h, v0.4s -; CHECK-NEXT: fcvtn v1.4h, v1.4s -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z1.h, p0/m, z1.s +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z5.s, z1.s[2] +; CHECK-NEXT: scvtf z0.h, p0/m, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z2.s, z0.s[3] +; CHECK-NEXT: mov z3.s, z0.s[2] +; CHECK-NEXT: mov z4.s, z0.s[1] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w9, [sp] +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: mov z0.s, z1.s[3] +; CHECK-NEXT: mov z1.s, z1.s[1] +; CHECK-NEXT: strh w10, [sp, #14] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: strh w9, [sp, #10] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strh w10, [sp, #6] +; CHECK-NEXT: strh w8, [sp, #4] +; CHECK-NEXT: strh w9, [sp, #2] +; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret %op1 = load <8 x i32>, <8 x i32>* %a %res = sitofp <8 x i32> %op1 to <8 x half> @@ -796,7 +1098,10 @@ define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) #0 { ; CHECK-LABEL: scvtf_v2i32_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: scvtf v0.2s, v0.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x float> ret <2 x float> %res @@ -805,7 +1110,10 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) #0 { ; CHECK-LABEL: scvtf_v4i32_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: scvtf v0.4s, v0.4s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = sitofp <4 x i32> %op1 to <4 x float> ret <4 x float> %res @@ -815,8 +1123,9 @@ ; CHECK-LABEL: scvtf_v8i32_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: scvtf v0.4s, v0.4s -; CHECK-NEXT: scvtf v1.4s, v1.4s +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s +; CHECK-NEXT: scvtf z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %op1 = load <8 x i32>, <8 x i32>* %a @@ -833,8 +1142,10 @@ ; CHECK-LABEL: scvtf_v2i32_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x double> ret <2 x double> %res @@ -844,11 +1155,12 @@ ; CHECK-LABEL: scvtf_v4i32_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: sunpklo z1.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: scvtf z1.d, p0/m, z1.d +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <4 x i32>, <4 x i32>* %a @@ -860,19 +1172,21 @@ define void @scvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 { ; CHECK-LABEL: scvtf_v8i32_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: sunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: scvtf v3.2d, v3.2d +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: sunpklo z2.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: scvtf v2.2d, v2.2d -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: stp q3, q1, [x1] -; CHECK-NEXT: stp q2, q0, [x1, #32] +; CHECK-NEXT: sunpklo z3.d, z1.s +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: scvtf z3.d, p0/m, z3.d +; CHECK-NEXT: scvtf z1.d, p0/m, z1.d +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: stp q3, q1, [x1, #32] +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: scvtf z1.d, p0/m, z2.d +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x i32>, <8 x i32>* %a %res = sitofp <8 x i32> %op1 to <8 x double> @@ -883,32 +1197,38 @@ define void @scvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) #0 { ; CHECK-LABEL: scvtf_v16i32_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: sunpklo z4.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: scvtf v4.2d, v4.2d -; CHECK-NEXT: ldp q3, q2, [x0, #32] -; CHECK-NEXT: sunpklo z5.d, z0.s +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov z6.d, z2.d +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: scvtf z2.d, p0/m, z2.d +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: mov z7.d, z3.d +; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: scvtf z3.d, p0/m, z3.d +; CHECK-NEXT: sunpklo z7.d, z7.s +; CHECK-NEXT: sunpklo z6.d, z6.s +; CHECK-NEXT: sunpklo z4.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: scvtf v5.2d, v5.2d -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: sunpklo z7.d, z3.s -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: sunpklo z3.d, z3.s -; CHECK-NEXT: scvtf v7.2d, v7.2d -; CHECK-NEXT: sunpklo z6.d, z2.s -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: scvtf v6.2d, v6.2d -; CHECK-NEXT: scvtf v2.2d, v2.2d -; CHECK-NEXT: scvtf v3.2d, v3.2d -; CHECK-NEXT: stp q4, q1, [x1] -; CHECK-NEXT: stp q5, q0, [x1, #32] -; CHECK-NEXT: stp q7, q3, [x1, #64] -; CHECK-NEXT: stp q6, q2, [x1, #96] +; CHECK-NEXT: scvtf z7.d, p0/m, z7.d +; CHECK-NEXT: sunpklo z5.d, z1.s +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: stp q3, q7, [x1, #96] +; CHECK-NEXT: movprfx z3, z6 +; CHECK-NEXT: scvtf z3.d, p0/m, z6.d +; CHECK-NEXT: stp q2, q3, [x1, #64] +; CHECK-NEXT: movprfx z2, z5 +; CHECK-NEXT: scvtf z2.d, p0/m, z5.d +; CHECK-NEXT: scvtf z1.d, p0/m, z1.d +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: stp q2, q1, [x1, #32] +; CHECK-NEXT: movprfx z2, z4 +; CHECK-NEXT: scvtf z2.d, p0/m, z4.d +; CHECK-NEXT: stp q2, q0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a %res = sitofp <16 x i32> %op1 to <16 x double> @@ -943,14 +1263,37 @@ define <4 x half> @scvtf_v4i64_v4f16(<4 x i64>* %a) #0 { ; CHECK-LABEL: scvtf_v4i64_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn v1.2s, v1.2d -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z0.s, p0/m, z0.d +; CHECK-NEXT: mov z2.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: scvtf z0.s, p0/m, z1.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: ldr q0, [sp] +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #30] +; CHECK-NEXT: strh w10, [sp, #28] +; CHECK-NEXT: strh w8, [sp, #26] +; CHECK-NEXT: ldr d0, [sp, #24] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %op1 = load <4 x i64>, <4 x i64>* %a %res = sitofp <4 x i64> %op1 to <4 x half> @@ -964,8 +1307,17 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) #0 { ; CHECK-LABEL: scvtf_v2i64_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z0.s, p0/m, z0.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x float> ret <2 x float> %res @@ -974,14 +1326,22 @@ define <4 x float> @scvtf_v4i64_v4f32(<4 x i64>* %a) #0 { ; CHECK-LABEL: scvtf_v4i64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn v1.2s, v1.2d -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z0.s, p0/m, z0.d +; CHECK-NEXT: mov z2.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: scvtf z0.s, p0/m, z1.d +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret %op1 = load <4 x i64>, <4 x i64>* %a %res = sitofp <4 x i64> %op1 to <4 x float> @@ -995,7 +1355,10 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) #0 { ; CHECK-LABEL: scvtf_v2i64_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x double> ret <2 x double> %res @@ -1005,8 +1368,9 @@ ; CHECK-LABEL: scvtf_v4i64_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: scvtf z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %op1 = load <4 x i64>, <4 x i64>* %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll @@ -421,29 +421,25 @@ ; CHECK-LABEL: trunc_v8i64_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: fmov x9, d1 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z4.s, z1.s[1] -; CHECK-NEXT: xtn v3.2s, v3.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov z4.d, z0.d[1] ; CHECK-NEXT: strb w9, [sp, #12] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: xtn v2.2s, v2.2d +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z0.d, z1.d[1] ; CHECK-NEXT: strb w8, [sp, #14] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: mov z0.s, z2.s[1] -; CHECK-NEXT: mov z2.s, z3.s[1] +; CHECK-NEXT: fmov x8, d3 ; CHECK-NEXT: strb w9, [sp, #15] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: mov z2.d, z3.d[1] ; CHECK-NEXT: strb w8, [sp, #8] -; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d2 ; CHECK-NEXT: strb w10, [sp, #10] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov x10, d0 ; CHECK-NEXT: strb w8, [sp, #11] ; CHECK-NEXT: strb w10, [sp, #13] ; CHECK-NEXT: strb w9, [sp, #9] @@ -512,29 +508,25 @@ ; CHECK-LABEL: trunc_v8i64_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: fmov x9, d1 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z4.s, z1.s[1] -; CHECK-NEXT: xtn v3.2s, v3.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov z4.d, z0.d[1] ; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: xtn v2.2s, v2.2d +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z0.d, z1.d[1] ; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: mov z0.s, z2.s[1] -; CHECK-NEXT: mov z2.s, z3.s[1] +; CHECK-NEXT: fmov x8, d3 ; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: mov z2.d, z3.d[1] ; CHECK-NEXT: strh w8, [sp] -; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d2 ; CHECK-NEXT: strh w10, [sp, #4] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov x10, d0 ; CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: strh w10, [sp, #10] ; CHECK-NEXT: strh w9, [sp, #2] @@ -604,18 +596,30 @@ define void @trunc_v8i64_v8i32(<8 x i64>* %in, <8 x i32>* %out) #0 { ; CHECK-LABEL: trunc_v8i64_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: fmov x9, d1 ; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: xtn v2.2s, v2.2d +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: mov z0.d, z2.d[1] +; CHECK-NEXT: stp w8, w10, [sp, #24] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: mov z1.d, z3.d[1] +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: stp w9, w12, [sp, #16] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: stp w11, w8, [sp, #8] +; CHECK-NEXT: stp w9, w10, [sp] +; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: add z1.s, z1.s, z1.s ; CHECK-NEXT: add z0.s, z0.s, z0.s -; CHECK-NEXT: xtn v3.2s, v3.2d -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: add z1.s, z2.s, z2.s ; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %a = load <8 x i64>, <8 x i64>* %in %b = trunc <8 x i64> %a to <8 x i32>