diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1887,13 +1887,18 @@ defm : pred_store; defm : pred_store; - multiclass unpred_store { + multiclass unpred_store { let AddedComplexity = 1 in { + def _reg : Pat<(Store (Ty ZPR:$val), (AddrCP GPR64sp:$base, GPR64:$offset)), + (RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>; + } + let AddedComplexity = 2 in { def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)), (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; } - let AddedComplexity = 2 in { + let AddedComplexity = 3 in { def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)), (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; } @@ -1902,32 +1907,36 @@ (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>; } - defm : unpred_store< store, nxv16i8, ST1B_IMM, PTRUE_B>; - defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H_IMM, PTRUE_H>; - defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S_IMM, PTRUE_S>; - defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D_IMM, PTRUE_D>; - defm : unpred_store< store, nxv8i16, ST1H_IMM, PTRUE_H>; - defm : unpred_store; - defm : unpred_store; - defm : unpred_store< store, nxv4i32, ST1W_IMM, PTRUE_S>; - defm : unpred_store; - defm : unpred_store< store, nxv2i64, ST1D_IMM, PTRUE_D>; - defm : unpred_store< store, nxv8f16, ST1H_IMM, PTRUE_H>; - defm : unpred_store< store, nxv8bf16, ST1H_IMM, PTRUE_H>; - defm : unpred_store< store, nxv4f16, ST1H_S_IMM, PTRUE_S>; - defm : unpred_store< store, nxv2f16, ST1H_D_IMM, PTRUE_D>; - defm : unpred_store< store, nxv4f32, ST1W_IMM, PTRUE_S>; - defm : unpred_store< store, nxv2f32, ST1W_D_IMM, PTRUE_D>; - defm : unpred_store< store, nxv2f64, ST1D_IMM, PTRUE_D>; - - multiclass unpred_load { + defm : unpred_store< store, nxv16i8, ST1B, ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>; + defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>; + defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>; + defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D, ST1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>; + defm : unpred_store< store, nxv8i16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_store; + defm : unpred_store; + defm : unpred_store< store, nxv4i32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>; + defm : unpred_store; + defm : unpred_store< store, nxv2i64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>; + defm : unpred_store< store, nxv8f16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv8bf16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv4f16, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv2f16, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv4f32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>; + defm : unpred_store< store, nxv2f32, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>; + defm : unpred_store< store, nxv2f64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>; + + multiclass unpred_load { let AddedComplexity = 1 in { + def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))), + (RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>; + } + let AddedComplexity = 2 in { def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))), (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; } - - let AddedComplexity = 2 in { + let AddedComplexity = 3 in { def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))), (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; } @@ -1936,35 +1945,35 @@ (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>; } - defm : unpred_load< load, nxv16i8, LD1B_IMM, PTRUE_B>; - defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>; - defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>; - defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>; - defm : unpred_load< extloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>; - defm : unpred_load< extloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>; - defm : unpred_load< extloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>; - defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H_IMM, PTRUE_H>; - defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S_IMM, PTRUE_S>; - defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D_IMM, PTRUE_D>; - defm : unpred_load< load, nxv8i16, LD1H_IMM, PTRUE_H>; - defm : unpred_load; - defm : unpred_load; - defm : unpred_load< extloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>; - defm : unpred_load< extloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>; - defm : unpred_load; - defm : unpred_load; - defm : unpred_load< load, nxv4i32, LD1W_IMM, PTRUE_S>; - defm : unpred_load; - defm : unpred_load< extloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>; - defm : unpred_load; - defm : unpred_load< load, nxv2i64, LD1D_IMM, PTRUE_D>; - defm : unpred_load< load, nxv8f16, LD1H_IMM, PTRUE_H>; - defm : unpred_load< load, nxv8bf16, LD1H_IMM, PTRUE_H>; - defm : unpred_load< load, nxv4f16, LD1H_S_IMM, PTRUE_S>; - defm : unpred_load< load, nxv2f16, LD1H_D_IMM, PTRUE_D>; - defm : unpred_load< load, nxv4f32, LD1W_IMM, PTRUE_S>; - defm : unpred_load< load, nxv2f32, LD1W_D_IMM, PTRUE_D>; - defm : unpred_load< load, nxv2f64, LD1D_IMM, PTRUE_D>; + defm : unpred_load< load, nxv16i8, LD1B, LD1B_IMM, PTRUE_B, am_sve_regreg_lsl0>; + defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H, LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>; + defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S, LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>; + defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D, LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>; + defm : unpred_load< extloadvi8, nxv8i16, LD1B_H, LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>; + defm : unpred_load< extloadvi8, nxv4i32, LD1B_S, LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>; + defm : unpred_load< extloadvi8, nxv2i64, LD1B_D, LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>; + defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H, LD1SB_H_IMM, PTRUE_H, am_sve_regreg_lsl0>; + defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S, LD1SB_S_IMM, PTRUE_S, am_sve_regreg_lsl0>; + defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D, LD1SB_D_IMM, PTRUE_D, am_sve_regreg_lsl0>; + defm : unpred_load< load, nxv8i16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_load; + defm : unpred_load; + defm : unpred_load< extloadvi16, nxv4i32, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>; + defm : unpred_load< extloadvi16, nxv2i64, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>; + defm : unpred_load; + defm : unpred_load; + defm : unpred_load< load, nxv4i32, LD1W, LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>; + defm : unpred_load; + defm : unpred_load< extloadvi32, nxv2i64, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>; + defm : unpred_load; + defm : unpred_load< load, nxv2i64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>; + defm : unpred_load< load, nxv8f16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv8bf16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv4f16, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv2f16, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv4f32, LD1W, LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>; + defm : unpred_load< load, nxv2f32, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>; + defm : unpred_load< load, nxv2f64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>; multiclass unpred_store_predicate { def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)), diff --git a/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll b/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -disable-lsr < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; Check that vscale call is recognised by load/store reg/reg pattern and +; partially folded, with the rest pulled out of the loop. This requires LSR to +; be disabled, which is something that will be addressed at a later date. + +define void @ld1w_reg_loop([32000 x i32]* %addr) { +; CHECK-LABEL: ld1w_reg_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: cntw x9 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: .LBB0_1: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: adds x8, x8, x9 +; CHECK-NEXT: b.ne .LBB0_1 +; CHECK-NEXT: // %bb.2: // %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %0 = call i64 @llvm.vscale.i64() + %1 = shl i64 %0, 2 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %2 = getelementptr inbounds [32000 x i32], [32000 x i32]* %addr, i64 0, i64 %index + %3 = bitcast i32* %2 to * + %load = load volatile , * %3, align 16 + %index.next = add i64 %index, %1 + %4 = icmp eq i64 %index.next, 0 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: + ret void +} + +define void @st1w_reg_loop([32000 x i32]* %addr, %val) { +; CHECK-LABEL: st1w_reg_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: cntw x9 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: .LBB1_1: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: adds x8, x8, x9 +; CHECK-NEXT: b.ne .LBB1_1 +; CHECK-NEXT: // %bb.2: // %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %0 = call i64 @llvm.vscale.i64() + %1 = shl i64 %0, 2 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %2 = getelementptr inbounds [32000 x i32], [32000 x i32]* %addr, i64 0, i64 %index + %3 = bitcast i32* %2 to * + store volatile %val, * %3, align 16 + %index.next = add i64 %index, %1 + %4 = icmp eq i64 %index.next, 0 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: + ret void +} + +declare i64 @llvm.vscale.i64() diff --git a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll --- a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll @@ -43,9 +43,9 @@ define @ld1b_out_of_upper_bound(* %a) { ; CHECK-LABEL: ld1b_out_of_upper_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: addvl x8, x0, #8 +; CHECK-NEXT: rdvl x8, #8 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; CHECK-NEXT: ret %base = getelementptr , * %a, i64 8 %load = load , * %base @@ -55,9 +55,9 @@ define @ld1b_out_of_lower_bound(* %a) { ; CHECK-LABEL: ld1b_out_of_lower_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: addvl x8, x0, #-9 +; CHECK-NEXT: rdvl x8, #-9 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; CHECK-NEXT: ret %base = getelementptr , * %a, i64 -9 %load = load , * %base @@ -139,3 +139,72 @@ ret void } +define void @ld1b_reg([64 x i8]* %addr, i64 %off) { +; CHECK-LABEL: ld1b_reg: +; CHECK: // %bb.0: +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: add x8, x1, x8 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; CHECK-NEXT: ret + %vscale = call i64 @llvm.vscale.i64() + %mul = mul i64 %vscale, 4 + %offset = add i64 %off, %mul + %ptr = getelementptr inbounds [64 x i8], [64 x i8]* %addr, i64 0, i64 %offset + %ptrcast = bitcast i8* %ptr to * + %val = load volatile , * %ptrcast + ret void +} + +define void @ld1h_reg([64 x i16]* %addr, i64 %off) { +; CHECK-LABEL: ld1h_reg: +; CHECK: // %bb.0: +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: add x8, x1, x8 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ret + %vscale = call i64 @llvm.vscale.i64() + %mul = mul i64 %vscale, 4 + %offset = add i64 %off, %mul + %ptr = getelementptr inbounds [64 x i16], [64 x i16]* %addr, i64 0, i64 %offset + %ptrcast = bitcast i16* %ptr to * + %val = load volatile , * %ptrcast + ret void +} + +define void @ld1w_reg([64 x i32]* %addr, i64 %off) { +; CHECK-LABEL: ld1w_reg: +; CHECK: // %bb.0: +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: add x8, x1, x8 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ret + %vscale = call i64 @llvm.vscale.i64() + %mul = mul i64 %vscale, 4 + %offset = add i64 %off, %mul + %ptr = getelementptr inbounds [64 x i32], [64 x i32]* %addr, i64 0, i64 %offset + %ptrcast = bitcast i32* %ptr to * + %val = load volatile , * %ptrcast + ret void +} + +define void @ld1d_reg([64 x i64]* %addr, i64 %off) { +; CHECK-LABEL: ld1d_reg: +; CHECK: // %bb.0: +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: add x8, x1, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NEXT: ret + %vscale = call i64 @llvm.vscale.i64() + %mul = mul i64 %vscale, 4 + %offset = add i64 %off, %mul + %ptr = getelementptr inbounds [64 x i64], [64 x i64]* %addr, i64 0, i64 %offset + %ptrcast = bitcast i64* %ptr to * + %val = load volatile , * %ptrcast + ret void +} + +declare i64 @llvm.vscale.i64() diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll --- a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll @@ -43,9 +43,9 @@ define void @st1b_out_of_upper_bound( %data, * %a) { ; CHECK-LABEL: st1b_out_of_upper_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: addvl x8, x0, #8 +; CHECK-NEXT: rdvl x8, #8 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x8] +; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8] ; CHECK-NEXT: ret %base = getelementptr , * %a, i64 8 store %data, * %base @@ -55,9 +55,9 @@ define void @st1b_out_of_lower_bound( %data, * %a) { ; CHECK-LABEL: st1b_out_of_lower_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: addvl x8, x0, #-9 +; CHECK-NEXT: rdvl x8, #-9 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: st1b { z0.b }, p0, [x8] +; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8] ; CHECK-NEXT: ret %base = getelementptr , * %a, i64 -9 store %data, * %base @@ -165,3 +165,73 @@ store %splat, * %out ret void } + +define void @st1b_reg([64 x i8]* %addr, i64 %off, %val) { +; CHECK-LABEL: st1b_reg: +; CHECK: // %bb.0: +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: add x8, x1, x8 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8] +; CHECK-NEXT: ret + %vscale = call i64 @llvm.vscale.i64() + %mul = mul i64 %vscale, 4 + %offset = add i64 %off, %mul + %ptr = getelementptr inbounds [64 x i8], [64 x i8]* %addr, i64 0, i64 %offset + %ptrcast = bitcast i8* %ptr to * + store %val, * %ptrcast + ret void +} + +define void @st1h_reg([64 x i16]* %addr, i64 %off, %val) { +; CHECK-LABEL: st1h_reg: +; CHECK: // %bb.0: +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: add x8, x1, x8 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; CHECK-NEXT: ret + %vscale = call i64 @llvm.vscale.i64() + %mul = mul i64 %vscale, 4 + %offset = add i64 %off, %mul + %ptr = getelementptr inbounds [64 x i16], [64 x i16]* %addr, i64 0, i64 %offset + %ptrcast = bitcast i16* %ptr to * + store %val, * %ptrcast + ret void +} + +define void @st1w_reg([64 x i32]* %addr, i64 %off, %val) { +; CHECK-LABEL: st1w_reg: +; CHECK: // %bb.0: +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: add x8, x1, x8 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: ret + %vscale = call i64 @llvm.vscale.i64() + %mul = mul i64 %vscale, 4 + %offset = add i64 %off, %mul + %ptr = getelementptr inbounds [64 x i32], [64 x i32]* %addr, i64 0, i64 %offset + %ptrcast = bitcast i32* %ptr to * + store %val, * %ptrcast + ret void +} + +define void @st1d_reg([64 x i64]* %addr, i64 %off, %val) { +; CHECK-LABEL: st1d_reg: +; CHECK: // %bb.0: +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: add x8, x1, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; CHECK-NEXT: ret + %vscale = call i64 @llvm.vscale.i64() + %mul = mul i64 %vscale, 4 + %offset = add i64 %off, %mul + %ptr = getelementptr inbounds [64 x i64], [64 x i64]* %addr, i64 0, i64 %offset + %ptrcast = bitcast i64* %ptr to * + store %val, * %ptrcast + ret void +} + +declare i64 @llvm.vscale.i64()