diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1438,12 +1438,18 @@ setTruncStoreAction(MVT::v2f64, MVT::v2f32, Custom); setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, - MVT::v4i32, MVT::v1i64, MVT::v2i64}) + MVT::v4i32, MVT::v1i64, MVT::v2i64}) { + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) + setOperationAction(Op, VT, Expand); addTypeForStreamingSVE(VT); + } for (MVT VT : - {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64}) + {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64}) { + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) + setOperationAction(Op, VT, Expand); addTypeForStreamingSVE(VT); + } } // NOTE: Currently this has to happen after computeRegisterProperties rather @@ -1661,6 +1667,32 @@ } void AArch64TargetLowering::addTypeForStreamingSVE(MVT VT) { + setOperationAction(ISD::STORE, VT, Legal); + setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); + setOperationAction(ISD::BITCAST, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::OR, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::UDIV, VT, Custom); + setOperationAction(ISD::SINT_TO_FP, VT, Custom); + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); + setOperationAction(ISD::UINT_TO_FP, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::SIGN_EXTEND, VT, Custom); @@ -4294,7 +4326,8 @@ EVT OpVT = Op.getValueType(); EVT ArgVT = Op.getOperand(0).getValueType(); - if (useSVEForFixedLengthVectorVT(OpVT)) + if (useSVEForFixedLengthVectorVT(OpVT, + Subtarget->forceStreamingCompatibleSVE())) return LowerFixedLengthBitcastToSVE(Op, DAG); if (OpVT.isScalableVector()) { diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll @@ -61,15 +61,15 @@ ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: ldr q1, [x0, #16] -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: ldr q3, [x1, #16] +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: ldr q3, [x1] ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: bic z2.d, z2.d, z4.d ; CHECK-NEXT: and z0.d, z0.d, z4.d ; CHECK-NEXT: bic z3.d, z3.d, z4.d ; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z3.d +; CHECK-NEXT: orr z1.d, z1.d, z2.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x half>, ptr %a @@ -121,21 +121,21 @@ ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: tst w2, #0x1 -; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: csetm w8, ne -; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: mvn w9, w8 -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: ldr q3, [x1, #16] +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: ldr q3, [x1] ; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: mov z5.s, w9 -; CHECK-NEXT: and z1.d, z1.d, z4.d ; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: and z2.d, z2.d, z5.d +; CHECK-NEXT: and z1.d, z1.d, z4.d ; CHECK-NEXT: and z3.d, z3.d, z5.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: and z2.d, z2.d, z5.d ; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <8 x float>, ptr %a %op2 = load volatile <8 x float>, ptr %b @@ -186,21 +186,21 @@ ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: tst w2, #0x1 -; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: csetm x8, ne -; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: mvn x9, x8 -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: ldr q3, [x1, #16] +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: ldr q3, [x1] ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: mov z5.d, x9 -; CHECK-NEXT: and z1.d, z1.d, z4.d ; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: and z2.d, z2.d, z5.d +; CHECK-NEXT: and z1.d, z1.d, z4.d ; CHECK-NEXT: and z3.d, z3.d, z5.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: and z2.d, z2.d, z5.d ; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <4 x double>, ptr %a %op2 = load volatile <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -718,8 +718,8 @@ ; CHECK-LABEL: fcvtzu_v1f64_v1i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzu x8, d0 -; CHECK-NEXT: mov z0.d, x8 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i64> @@ -1471,8 +1471,8 @@ ; CHECK-LABEL: fcvtzs_v1f64_v1i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzs x8, d0 -; CHECK-NEXT: mov z0.d, x8 +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i64> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -70,20 +70,20 @@ define void @select_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z2.h -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, z3.h +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z2.h +; CHECK-NEXT: fcmeq p0.h, p0/z, z1.h, z3.h ; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z5.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: bic z2.d, z2.d, z4.d ; CHECK-NEXT: bic z3.d, z3.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z5.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z3.d -; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z1.d, z1.d, z5.d +; CHECK-NEXT: and z0.d, z0.d, z4.d +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -133,20 +133,20 @@ define void @select_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z2.s -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z3.s +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z1.s, z3.s ; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: bic z2.d, z2.d, z4.d ; CHECK-NEXT: bic z3.d, z3.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z5.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z3.d -; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z1.d, z1.d, z5.d +; CHECK-NEXT: and z0.d, z0.d, z4.d +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -197,20 +197,20 @@ define void @select_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z2.d -; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, z3.d +; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z2.d +; CHECK-NEXT: fcmeq p0.d, p0/z, z1.d, z3.d ; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z5.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: bic z2.d, z2.d, z4.d ; CHECK-NEXT: bic z3.d, z3.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z5.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z3.d -; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z1.d, z1.d, z5.d +; CHECK-NEXT: and z0.d, z0.d, z4.d +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll @@ -398,12 +398,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: mov w8, #1 -; CHECK-NEXT: fmov d3, #5.00000000 -; CHECK-NEXT: index z4.d, #0, #1 +; CHECK-NEXT: fmov d4, #5.00000000 +; CHECK-NEXT: index z2.d, #0, #1 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, x8 -; CHECK-NEXT: cmpeq p0.d, p0/z, z4.d, z2.d -; CHECK-NEXT: mov z1.d, p0/m, d3 +; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z3.d +; CHECK-NEXT: mov z1.d, p0/m, d4 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll @@ -84,33 +84,33 @@ ; CHECK-NEXT: bl def ; CHECK-NEXT: ldp q0, q1, [sp, #16] ; CHECK-NEXT: mov z2.b, z0.b[14] -; CHECK-NEXT: mov z3.b, z0.b[12] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: mov z3.b, z0.b[12] ; CHECK-NEXT: mov z4.b, z0.b[10] ; CHECK-NEXT: mov z5.b, z0.b[8] -; CHECK-NEXT: mov z6.b, z0.b[6] +; CHECK-NEXT: fmov w10, s3 ; CHECK-NEXT: strb w8, [sp] ; CHECK-NEXT: fmov w8, s4 ; CHECK-NEXT: strb w9, [sp, #7] ; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: strb w10, [sp, #6] -; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: mov z6.b, z0.b[6] ; CHECK-NEXT: mov z7.b, z0.b[4] ; CHECK-NEXT: mov z0.b, z0.b[2] +; CHECK-NEXT: strb w10, [sp, #6] +; CHECK-NEXT: fmov w10, s6 ; CHECK-NEXT: strb w8, [sp, #5] ; CHECK-NEXT: fmov w8, s7 ; CHECK-NEXT: strb w9, [sp, #4] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strb w10, [sp, #3] -; CHECK-NEXT: fmov w10, s1 ; CHECK-NEXT: strb w8, [sp, #2] +; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: strb w9, [sp, #1] -; CHECK-NEXT: strb w10, [x19, #8] ; CHECK-NEXT: ldr q0, [sp] -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: str x8, [x19] +; CHECK-NEXT: strb w8, [x19, #8] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: str x9, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll @@ -61,6 +61,7 @@ ; CHECK-LABEL: load_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %load = load <2 x half>, ptr %a ret <2 x half> %load diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -247,44 +247,45 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) #0 { ; CHECK-LABEL: masked_load_v8f32: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z1.b, z0.b[3] -; CHECK-NEXT: mov z2.b, z0.b[2] -; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: mov z3.b, z0.b[1] -; CHECK-NEXT: mov z4.b, z0.b[7] -; CHECK-NEXT: mov z5.b, z0.b[6] -; CHECK-NEXT: mov z6.b, z0.b[5] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: mov z0.b, z0.b[4] +; CHECK-NEXT: mov z1.b, z0.b[7] +; CHECK-NEXT: mov z2.b, z0.b[6] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z3.b, z0.b[5] +; CHECK-NEXT: mov z1.b, z0.b[4] +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: mov z2.b, z0.b[3] +; CHECK-NEXT: strh w8, [sp, #14] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: strh w9, [sp, #12] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z3.b, z0.b[2] +; CHECK-NEXT: mov z1.b, z0.b[1] +; CHECK-NEXT: strh w10, [sp, #10] ; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: strh w8, [sp, #-16]! -; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: strh w8, [sp, #8] ; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w9, [sp, #6] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: strh w10, [sp, #4] -; CHECK-NEXT: fmov w10, s5 -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strh w9, [sp, #14] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w10, [sp, #12] -; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: strh w9, [sp] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strh w10, [sp, #6] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: strh w8, [sp, #4] ; CHECK-NEXT: mov x8, #4 -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: ldp d0, d1, [sp] -; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: strh w9, [sp, #2] +; CHECK-NEXT: ldp d1, d0, [sp] ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: lsl z0.s, p0/m, z0.s, #31 +; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: lsl z1.s, p0/m, z1.s, #31 -; CHECK-NEXT: asr z0.s, p0/m, z0.s, #31 +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, #31 ; CHECK-NEXT: asr z1.s, p0/m, z1.s, #31 +; CHECK-NEXT: asr z0.s, p0/m, z0.s, #31 ; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: add sp, sp, #16 @@ -323,10 +324,10 @@ ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, #63 ; CHECK-NEXT: asr z1.d, p0/m, z1.d, #63 ; CHECK-NEXT: asr z0.d, p0/m, z0.d, #63 -; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret