diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -6545,7 +6545,7 @@
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
     // We can only encode VL scaled offsets, so only fold in frame indexes
     // referencing SVE objects.
-    if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector) {
+    if (MFI.getStackID(FI) == TargetStackID::ScalableVector) {
      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
      OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
      return true;
@@ -6580,7 +6580,7 @@
     int FI = cast<FrameIndexSDNode>(Base)->getIndex();
     // We can only encode VL scaled offsets, so only fold in frame indexes
     // referencing SVE objects.
-    if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector)
+    if (MFI.getStackID(FI) == TargetStackID::ScalableVector)
      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
   }

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll
@@ -8,16 +8,19 @@
 define void @st1d_fixed(ptr %ptr) #0 {
 ; CHECK-LABEL: st1d_fixed:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #144
-; CHECK-NEXT: stp x30, x19, [sp, #128] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #160
+; CHECK-NEXT: stp x20, x19, [sp, #144] // 16-byte Folded Spill
 ; CHECK-NEXT: mov x19, x0
 ; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: str x30, [sp, #128] // 8-byte Folded Spill
+; CHECK-NEXT: mov x20, sp
 ; CHECK-NEXT: bl def
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [sp]
+; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x20]
+; CHECK-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload
 ; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #128] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #144
+; CHECK-NEXT: ldp x20, x19, [sp, #144] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #160
 ; CHECK-NEXT: ret
   %alloc = alloca [16 x double]
   call void @def(ptr %alloc)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll
@@ -9,13 +9,14 @@
 ; accessing fixed width objects.
 define void @foo(ptr %a) #0 {
 ; CHECK-LABEL: foo:
-; CHECK: SelectionDAG has 14 nodes:
+; CHECK: SelectionDAG has 15 nodes:
 ; CHECK-NEXT: t0: ch,glue = EntryToken
 ; CHECK-NEXT: t12: nxv2i1 = PTRUE_D TargetConstant:i32<31>
 ; CHECK-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0
 ; CHECK-NEXT: t18: nxv2i64,ch = LD1D_IMM t12, t2, TargetConstant:i64<0>, t0
 ; CHECK-NEXT: t8: i64 = ADDXri TargetFrameIndex:i64<1>, TargetConstant:i32<0>, TargetConstant:i32<0>
-; CHECK-NEXT: t17: ch = ST1D_IMM t18, t12, TargetFrameIndex:i64<0>, TargetConstant:i64<0>, t18:1
+; CHECK-NEXT: t6: i64 = ADDXri TargetFrameIndex:i64<0>, TargetConstant:i32<0>, TargetConstant:i32<0>
+; CHECK-NEXT: t17: ch = ST1D_IMM t18, t12, t6, TargetConstant:i64<0>, t18:1
 ; CHECK-NEXT: t16: ch = ST1D_IMM t18, t12, t8, TargetConstant:i64<0>, t17
 ; CHECK-NEXT: t10: ch = RET_ReallyLR t16
 ; CHECK-EMPTY:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
@@ -205,14 +205,15 @@
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT: mov z1.d, z0.d[2]
-; CHECK-NEXT: mov z2.d, z0.d[3]
-; CHECK-NEXT: mov x10, v0.d[1]
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: fmov x9, d2
 ; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: mov z1.d, z0.d[3]
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: mov x10, v0.d[1]
 ; CHECK-NEXT: stp x9, x8, [sp, #16]
+; CHECK-NEXT: mov x8, sp
 ; CHECK-NEXT: stp x10, x11, [sp]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT: mov sp, x29
 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
@@ -273,9 +274,9 @@
 ; CHECK-NEXT: ptrue p0.s, vl8
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w10, s0
 ; CHECK-NEXT: mov w9, v0.s[2]
 ; CHECK-NEXT: mov w11, v0.s[3]
-; CHECK-NEXT: fmov w10, s0
 ; CHECK-NEXT: mov z1.s, z0.s[4]
 ; CHECK-NEXT: mov z2.s, z0.s[5]
 ; CHECK-NEXT: mov z3.s, z0.s[6]
@@ -287,8 +288,9 @@
 ; CHECK-NEXT: fmov w9, s3
 ; CHECK-NEXT: fmov w11, s0
 ; CHECK-NEXT: stp w8, w10, [sp, #8]
+; CHECK-NEXT: mov x8, sp
 ; CHECK-NEXT: stp w11, w9, [sp]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT: mov sp, x29
 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
@@ -392,19 +394,21 @@
 ; CHECK-NEXT: mov z1.h, z0.h[8]
 ; CHECK-NEXT: fmov w8, s0
 ; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: mov z4.h, z0.h[11]
 ; CHECK-NEXT: mov z5.h, z0.h[12]
 ; CHECK-NEXT: mov z2.h, z0.h[9]
+; CHECK-NEXT: strh w8, [sp, #14]
+; CHECK-NEXT: fmov w8, s4
 ; CHECK-NEXT: mov z3.h, z0.h[10]
-; CHECK-NEXT: mov z4.h, z0.h[11]
-; CHECK-NEXT: fmov w11, s2
 ; CHECK-NEXT: strh w9, [sp, #30]
 ; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: mov z16.h, z0.h[15]
+; CHECK-NEXT: fmov w11, s2
 ; CHECK-NEXT: fmov w12, s3
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s4
+; CHECK-NEXT: strh w8, [sp, #24]
+; CHECK-NEXT: fmov w8, s16
 ; CHECK-NEXT: mov z6.h, z0.h[13]
 ; CHECK-NEXT: mov z7.h, z0.h[14]
-; CHECK-NEXT: mov z16.h, z0.h[15]
 ; CHECK-NEXT: umov w10, v0.h[1]
 ; CHECK-NEXT: strh w9, [sp, #22]
 ; CHECK-NEXT: umov w9, v0.h[2]
@@ -412,24 +416,23 @@
 ; CHECK-NEXT: fmov w11, s6
 ; CHECK-NEXT: strh w12, [sp, #26]
 ; CHECK-NEXT: fmov w12, s7
-; CHECK-NEXT: strh w8, [sp, #24]
-; CHECK-NEXT: fmov w8, s16
+; CHECK-NEXT: strh w8, [sp, #16]
+; CHECK-NEXT: umov w8, v0.h[5]
 ; CHECK-NEXT: strh w10, [sp, #12]
 ; CHECK-NEXT: strh w11, [sp, #20]
 ; CHECK-NEXT: umov w11, v0.h[3]
 ; CHECK-NEXT: strh w12, [sp, #18]
 ; CHECK-NEXT: umov w12, v0.h[4]
-; CHECK-NEXT: strh w8, [sp, #16]
-; CHECK-NEXT: umov w8, v0.h[5]
 ; CHECK-NEXT: umov w10, v0.h[6]
 ; CHECK-NEXT: strh w9, [sp, #10]
 ; CHECK-NEXT: umov w9, v0.h[7]
+; CHECK-NEXT: strh w8, [sp, #4]
+; CHECK-NEXT: mov x8, sp
 ; CHECK-NEXT: strh w11, [sp, #8]
 ; CHECK-NEXT: strh w12, [sp, #6]
-; CHECK-NEXT: strh w8, [sp, #4]
 ; CHECK-NEXT: strh w10, [sp, #2]
 ; CHECK-NEXT: strh w9, [sp]
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT: mov sp, x29
 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
@@ -457,39 +460,39 @@
 ; CHECK-NEXT: orr x9, x8, #0x1e
 ; CHECK-NEXT: orr x10, x8, #0x1c
 ; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: orr x12, x8, #0x10
 ; CHECK-NEXT: orr x11, x8, #0x18
+; CHECK-NEXT: orr x12, x8, #0x10
 ; CHECK-NEXT: str h0, [sp, #22]
 ; CHECK-NEXT: st1 { v0.h }[4], [x9]
 ; CHECK-NEXT: orr x9, x8, #0xe
 ; CHECK-NEXT: st1 { v0.h }[5], [x10]
 ; CHECK-NEXT: orr x10, x8, #0xc
-; CHECK-NEXT: st1 { v0.h }[3], [x12]
-; CHECK-NEXT: mov w12, #26
-; CHECK-NEXT: st1 { v1.h }[4], [x9]
-; CHECK-NEXT: orr x9, x8, #0x8
 ; CHECK-NEXT: st1 { v0.h }[7], [x11]
-; CHECK-NEXT: orr x11, x8, #0x2
+; CHECK-NEXT: orr x11, x8, #0x8
+; CHECK-NEXT: st1 { v1.h }[4], [x9]
+; CHECK-NEXT: orr x9, x8, #0x4
 ; CHECK-NEXT: st1 { v1.h }[5], [x10]
-; CHECK-NEXT: orr x10, x8, #0x4
-; CHECK-NEXT: st1 { v1.h }[7], [x9]
+; CHECK-NEXT: mov w10, #26
+; CHECK-NEXT: orr x10, x8, x10
+; CHECK-NEXT: st1 { v0.h }[3], [x12]
+; CHECK-NEXT: st1 { v1.h }[1], [x9]
+; CHECK-NEXT: orr x9, x8, #0x2
+; CHECK-NEXT: st1 { v1.h }[7], [x11]
+; CHECK-NEXT: mov w11, #20
+; CHECK-NEXT: mov w12, #18
+; CHECK-NEXT: st1 { v0.h }[6], [x10]
+; CHECK-NEXT: mov w10, #10
+; CHECK-NEXT: orr x11, x8, x11
+; CHECK-NEXT: st1 { v1.h }[2], [x9]
 ; CHECK-NEXT: orr x9, x8, x12
-; CHECK-NEXT: st1 { v1.h }[2], [x11]
-; CHECK-NEXT: mov w11, #10
-; CHECK-NEXT: st1 { v1.h }[1], [x10]
-; CHECK-NEXT: mov w10, #18
-; CHECK-NEXT: st1 { v0.h }[6], [x9]
-; CHECK-NEXT: mov w9, #20
-; CHECK-NEXT: orr x9, x8, x9
 ; CHECK-NEXT: orr x10, x8, x10
 ; CHECK-NEXT: st1 { v1.h }[3], [x8]
-; CHECK-NEXT: orr x8, x8, x11
-; CHECK-NEXT: str h1, [sp, #6]
+; CHECK-NEXT: st1 { v0.h }[1], [x11]
 ; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: st1 { v0.h }[1], [x9]
-; CHECK-NEXT: st1 { v0.h }[2], [x10]
-; CHECK-NEXT: st1 { v1.h }[6], [x8]
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
+; CHECK-NEXT: st1 { v0.h }[2], [x9]
+; CHECK-NEXT: st1 { v1.h }[6], [x10]
+; CHECK-NEXT: str h1, [sp, #6]
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
 ; CHECK-NEXT: st1h { z0.h }, p0, [x2]
 ; CHECK-NEXT: mov sp, x29
 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll
@@ -140,6 +140,7 @@
 ; VBITS_EQ_512-NEXT: sub x9, sp, #48
 ; VBITS_EQ_512-NEXT: and sp, x9, #0xffffffffffffffe0
 ; VBITS_EQ_512-NEXT: ptrue p0.d, vl4
+; VBITS_EQ_512-NEXT: mov x8, sp
 ; VBITS_EQ_512-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_EQ_512-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; VBITS_EQ_512-NEXT: mov z2.d, z1.d[3]
@@ -149,7 +150,7 @@
 ; VBITS_EQ_512-NEXT: mov z3.d, z0.d[2]
 ; VBITS_EQ_512-NEXT: zip1 z0.d, z0.d, z1.d
 ; VBITS_EQ_512-NEXT: stp d3, d2, [sp]
-; VBITS_EQ_512-NEXT: ld1d { z2.d }, p0/z, [sp]
+; VBITS_EQ_512-NEXT: ld1d { z2.d }, p0/z, [x8]
 ; VBITS_EQ_512-NEXT: fadd z0.d, p0/m, z0.d, z2.d
 ; VBITS_EQ_512-NEXT: st1d { z0.d }, p0, [x0]
 ; VBITS_EQ_512-NEXT: mov sp, x29
@@ -657,6 +658,7 @@
 ; CHECK-NEXT: sub x9, sp, #48
 ; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
 ; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: mov x8, sp
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT: mov z2.d, z1.d[3]
@@ -666,7 +668,7 @@
 ; CHECK-NEXT: mov z3.d, z0.d[2]
 ; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
 ; CHECK-NEXT: stp d3, d2, [sp]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [sp]
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x8]
 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z2.d
 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT: mov sp, x29
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
@@ -918,6 +918,7 @@
 ; CHECK-NEXT: sub x9, sp, #48
 ; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
 ; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: mov x8, sp
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT: mov z2.d, z1.d[1]
@@ -925,7 +926,7 @@
 ; CHECK-NEXT: mov z1.d, z0.d[3]
 ; CHECK-NEXT: mov z0.d, z0.d[2]
 ; CHECK-NEXT: stp d0, d1, [sp]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT: mov sp, x29
 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -8,23 +8,25 @@
 define void @alloc_v4i8(ptr %st_ptr) #0 {
 ; CHECK-LABEL: alloc_v4i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: add x0, sp, #12
+; CHECK-NEXT: add x0, sp, #28
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: add x20, sp, #28
 ; CHECK-NEXT: bl def
-; CHECK-NEXT: add x8, sp, #12
 ; CHECK-NEXT: ptrue p0.b, vl2
-; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x8]
+; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x20]
 ; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT: mov z2.b, z0.b[1]
 ; CHECK-NEXT: fmov w8, s0
 ; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: stp w8, w9, [sp]
-; CHECK-NEXT: ldr d0, [sp]
+; CHECK-NEXT: stp w8, w9, [sp, #8]
+; CHECK-NEXT: ldr d0, [sp, #8]
 ; CHECK-NEXT: st1b { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
   %alloc = alloca [4 x i8]
   call void @def(ptr %alloc)
@@ -38,13 +40,14 @@
 ; CHECK-LABEL: alloc_v6i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: sub sp, sp, #48
-; CHECK-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT: mov x19, x0
 ; CHECK-NEXT: add x0, sp, #24
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: add x20, sp, #24
 ; CHECK-NEXT: bl def
-; CHECK-NEXT: add x8, sp, #24
 ; CHECK-NEXT: ptrue p0.b, vl3
-; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x8]
+; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x20]
 ; CHECK-NEXT: ptrue p0.h, vl4
 ; CHECK-NEXT: fmov w8, s1
 ; CHECK-NEXT: mov z2.b, z1.b[3]
@@ -52,18 +55,19 @@
 ; CHECK-NEXT: mov z0.b, z1.b[1]
 ; CHECK-NEXT: fmov w9, s2
 ; CHECK-NEXT: fmov w10, s3
-; CHECK-NEXT: strh w8, [sp, #8]
+; CHECK-NEXT: strh w8, [sp]
 ; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strh w9, [sp, #14]
-; CHECK-NEXT: strh w10, [sp, #12]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: add x8, sp, #20
-; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: strh w9, [sp, #6]
+; CHECK-NEXT: strh w10, [sp, #4]
+; CHECK-NEXT: strh w8, [sp, #2]
+; CHECK-NEXT: add x8, sp, #12
+; CHECK-NEXT: ldr d0, [sp]
 ; CHECK-NEXT: st1b { z0.h }, p0, [x8]
-; CHECK-NEXT: ldrh w8, [sp, #20]
+; CHECK-NEXT: ldrh w8, [sp, #12]
 ; CHECK-NEXT: strb w10, [x19, #2]
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT: strh w8, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
   %alloc = alloca [6 x i8]
@@ -135,7 +139,7 @@
 ; CHECK-NEXT: bl def
 ; CHECK-NEXT: mov x8, #4
 ; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [sp]
+; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x20]
 ; CHECK-NEXT: ld2d { z2.d, z3.d }, p0/z, [x20, x8, lsl #3]
 ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT: stp q0, q2, [x19]