diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1891,13 +1891,18 @@
   defm : pred_store;
   defm : pred_store;

-  multiclass unpred_store<PatFrag Store, ValueType Ty,
-                          Instruction RegImmInst, Instruction PTrue> {
+  multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegRegInst,
+                          Instruction RegImmInst, Instruction PTrue,
+                          ComplexPattern AddrCP> {
     let AddedComplexity = 1 in {
+      def _reg : Pat<(Store (Ty ZPR:$val), (AddrCP GPR64sp:$base, GPR64:$offset)),
+                     (RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
+    }
+    let AddedComplexity = 2 in {
       def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
                      (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
     }
-    let AddedComplexity = 2 in {
+    let AddedComplexity = 3 in {
       def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
                     (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
     }
@@ -1906,32 +1911,36 @@
               (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
   }

-  defm : unpred_store<         store, nxv16i8,   ST1B_IMM, PTRUE_B>;
-  defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H_IMM, PTRUE_H>;
-  defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S_IMM, PTRUE_S>;
-  defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D_IMM, PTRUE_D>;
-  defm : unpred_store<         store, nxv8i16,   ST1H_IMM, PTRUE_H>;
-  defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S_IMM, PTRUE_S>;
-  defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D_IMM, PTRUE_D>;
-  defm : unpred_store<         store, nxv4i32,   ST1W_IMM, PTRUE_S>;
-  defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D_IMM, PTRUE_D>;
-  defm : unpred_store<         store, nxv2i64,   ST1D_IMM, PTRUE_D>;
-  defm : unpred_store<         store, nxv8f16,   ST1H_IMM, PTRUE_H>;
-  defm : unpred_store<         store, nxv8bf16,  ST1H_IMM, PTRUE_H>;
-  defm : unpred_store<         store, nxv4f16, ST1H_S_IMM, PTRUE_S>;
-  defm : unpred_store<         store, nxv2f16, ST1H_D_IMM, PTRUE_D>;
-  defm : unpred_store<         store, nxv4f32,   ST1W_IMM, PTRUE_S>;
-  defm : unpred_store<         store, nxv2f32, ST1W_D_IMM, PTRUE_D>;
-  defm : unpred_store<         store, nxv2f64,   ST1D_IMM, PTRUE_D>;
-
-  multiclass unpred_load<PatFrag Load, ValueType Ty,
-                         Instruction RegImmInst, Instruction PTrue> {
+  defm : unpred_store<         store, nxv16i8,    ST1B,   ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
+  defm : unpred_store< truncstorevi8, nxv8i16,  ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
+  defm : unpred_store< truncstorevi8, nxv4i32,  ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
+  defm : unpred_store< truncstorevi8, nxv2i64,  ST1B_D, ST1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
+  defm : unpred_store<         store, nxv8i16,    ST1H,   ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+  defm : unpred_store<truncstorevi16, nxv4i32,  ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+  defm : unpred_store<truncstorevi16, nxv2i64,  ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+  defm : unpred_store<         store, nxv4i32,    ST1W,   ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+  defm : unpred_store<truncstorevi32, nxv2i64,  ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+  defm : unpred_store<         store, nxv2i64,    ST1D,   ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
+  defm : unpred_store<         store, nxv8f16,    ST1H,   ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+  defm : unpred_store<         store, nxv8bf16,   ST1H,   ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+  defm : unpred_store<         store, nxv4f16,  ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+  defm : unpred_store<         store, nxv2f16,  ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+  defm : unpred_store<         store, nxv4f32,    ST1W,   ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+  defm : unpred_store<         store, nxv2f32,  ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+  defm : unpred_store<         store, nxv2f64,    ST1D,   ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
+
+  multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegRegInst,
+                         Instruction RegImmInst, Instruction PTrue,
+                         ComplexPattern AddrCP> {
     let AddedComplexity = 1 in {
+      def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))),
+                    (RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
+    }
+    let AddedComplexity = 2 in {
       def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
                     (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
     }
-
-    let AddedComplexity = 2 in {
+    let AddedComplexity = 3 in {
      def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
                    (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
     }
@@ -1940,35 +1949,35 @@
               (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
   }

-  defm : unpred_load<        load, nxv16i8,    LD1B_IMM, PTRUE_B>;
-  defm : unpred_load< zextloadvi8, nxv8i16,  LD1B_H_IMM, PTRUE_H>;
-  defm : unpred_load< zextloadvi8, nxv4i32,  LD1B_S_IMM, PTRUE_S>;
-  defm : unpred_load< zextloadvi8, nxv2i64,  LD1B_D_IMM, PTRUE_D>;
-  defm : unpred_load<  extloadvi8, nxv8i16,  LD1B_H_IMM, PTRUE_H>;
-  defm : unpred_load<  extloadvi8, nxv4i32,  LD1B_S_IMM, PTRUE_S>;
-  defm : unpred_load<  extloadvi8, nxv2i64,  LD1B_D_IMM, PTRUE_D>;
-  defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H_IMM, PTRUE_H>;
-  defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S_IMM, PTRUE_S>;
-  defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D_IMM, PTRUE_D>;
-  defm : unpred_load<        load, nxv8i16,    LD1H_IMM, PTRUE_H>;
-  defm : unpred_load<zextloadvi16, nxv4i32,  LD1H_S_IMM, PTRUE_S>;
-  defm : unpred_load<zextloadvi16, nxv2i64,  LD1H_D_IMM, PTRUE_D>;
-  defm : unpred_load< extloadvi16, nxv4i32,  LD1H_S_IMM, PTRUE_S>;
-  defm : unpred_load< extloadvi16, nxv2i64,  LD1H_D_IMM, PTRUE_D>;
-  defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S_IMM, PTRUE_S>;
-  defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D_IMM, PTRUE_D>;
-  defm : unpred_load<        load, nxv4i32,    LD1W_IMM, PTRUE_S>;
-  defm : unpred_load<zextloadvi32, nxv2i64,  LD1W_D_IMM, PTRUE_D>;
-  defm : unpred_load< extloadvi32, nxv2i64,  LD1W_D_IMM, PTRUE_D>;
-  defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D_IMM, PTRUE_D>;
-  defm : unpred_load<        load, nxv2i64,    LD1D_IMM, PTRUE_D>;
-  defm : unpred_load<        load, nxv8f16,    LD1H_IMM, PTRUE_H>;
-  defm : unpred_load<        load, nxv8bf16,   LD1H_IMM, PTRUE_H>;
-  defm : unpred_load<        load, nxv4f16,  LD1H_S_IMM, PTRUE_S>;
-  defm : unpred_load<        load, nxv2f16,  LD1H_D_IMM, PTRUE_D>;
-  defm : unpred_load<        load, nxv4f32,    LD1W_IMM, PTRUE_S>;
-  defm : unpred_load<        load, nxv2f32,  LD1W_D_IMM, PTRUE_D>;
-  defm : unpred_load<        load, nxv2f64,    LD1D_IMM, PTRUE_D>;
+  defm : unpred_load<        load, nxv16i8,    LD1B,    LD1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
+  defm : unpred_load< zextloadvi8, nxv8i16,  LD1B_H,  LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
+  defm : unpred_load< zextloadvi8, nxv4i32,  LD1B_S,  LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
+  defm : unpred_load< zextloadvi8, nxv2i64,  LD1B_D,  LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
+  defm : unpred_load<  extloadvi8, nxv8i16,  LD1B_H,  LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
+  defm : unpred_load<  extloadvi8, nxv4i32,  LD1B_S,  LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
+  defm : unpred_load<  extloadvi8, nxv2i64,  LD1B_D,  LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
+  defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H, LD1SB_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
+  defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S, LD1SB_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
+  defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D, LD1SB_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
+  defm : unpred_load<        load, nxv8i16,    LD1H,    LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+  defm : unpred_load<zextloadvi16, nxv4i32,  LD1H_S,  LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+  defm : unpred_load<zextloadvi16, nxv2i64,  LD1H_D,  LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+  defm : unpred_load< extloadvi16, nxv4i32,  LD1H_S,  LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+  defm : unpred_load< extloadvi16, nxv2i64,  LD1H_D,  LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+  defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S, LD1SH_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+  defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D, LD1SH_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+  defm : unpred_load<        load, nxv4i32,    LD1W,    LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+  defm : unpred_load<zextloadvi32, nxv2i64,  LD1W_D,  LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+  defm : unpred_load< extloadvi32, nxv2i64,  LD1W_D,  LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+  defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D, LD1SW_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+  defm : unpred_load<        load, nxv2i64,    LD1D,    LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
+  defm : unpred_load<        load, nxv8f16,    LD1H,    LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+  defm : unpred_load<        load, nxv8bf16,   LD1H,    LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+  defm : unpred_load<        load, nxv4f16,  LD1H_S,  LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+  defm : unpred_load<        load, nxv2f16,  LD1H_D,  LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+  defm : unpred_load<        load, nxv4f32,    LD1W,    LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+  defm : unpred_load<        load, nxv2f32,  LD1W_D,  LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+  defm : unpred_load<        load, nxv2f64,    LD1D,    LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;

   multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
     def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -14,14 +14,13 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: rdvl x9, #1
 ; CHECK-NEXT: sub x9, x9, #1 // =1
-; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: st1b { z0.b }, p0, [sp]
 ; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -36,15 +35,14 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: rdvl x9, #1
 ; CHECK-NEXT: sub x9, x9, #1 // =1
-; CHECK-NEXT: mov w10, #15
-; CHECK-NEXT: cmp x9, #15 // =15
 ; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x9, x10, lo
+; CHECK-NEXT: mov w10, #15
+; CHECK-NEXT: cmp x9, #15 // =15
 ; CHECK-NEXT: st1b { z0.b }, p0, [sp]
 ; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x9, x10, lo
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -60,15 +58,14 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: rdvl x9, #1
 ; CHECK-NEXT: sub x9, x9, #1 // =1
-; CHECK-NEXT: mov w10, #16
-; CHECK-NEXT: cmp x9, #16 // =16
 ; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x9, x10, lo
+; CHECK-NEXT: mov w10, #16
+; CHECK-NEXT: cmp x9, #16 // =16
 ; CHECK-NEXT: st1b { z0.b }, p0, [sp]
 ; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x9, x10, lo
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -83,14 +80,13 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cnth x9
 ; CHECK-NEXT: sub x9, x9, #1 // =1
-; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #1
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -105,15 +101,14 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cnth x10
 ; CHECK-NEXT: sub x10, x10, #1 // =1
-; CHECK-NEXT: mov w9, #7
-; CHECK-NEXT: cmp x10, #7 // =7
 ; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: mov w9, #7
+; CHECK-NEXT: cmp x10, #7 // =7
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #1
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -129,15 +124,14 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cnth x10
 ; CHECK-NEXT: sub x10, x10, #1 // =1
-; CHECK-NEXT: mov w9, #8
-; CHECK-NEXT: cmp x10, #8 // =8
 ; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: mov w9, #8
+; CHECK-NEXT: cmp x10, #8 // =8
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #1
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -152,14 +146,13 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntw x9
 ; CHECK-NEXT: sub x9, x9, #1 // =1
-; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #2
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -174,15 +167,14 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntw x10
 ; CHECK-NEXT: sub x10, x10, #1 // =1
-; CHECK-NEXT: mov w9, #3
-; CHECK-NEXT: cmp x10, #3 // =3
 ; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: mov w9, #3
+; CHECK-NEXT: cmp x10, #3 // =3
 ; CHECK-NEXT: st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #2
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -198,15 +190,14 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntw x10
 ; CHECK-NEXT: sub x10, x10, #1 // =1
-; CHECK-NEXT: mov w9, #4
-; CHECK-NEXT: cmp x10, #4 // =4
 ; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: mov w9, #4
+; CHECK-NEXT: cmp x10, #4 // =4
 ; CHECK-NEXT: st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #2
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -221,14 +212,13 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntd x9
 ; CHECK-NEXT: sub x9, x9, #1 // =1
-; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #3
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -243,14 +233,13 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntd x9
 ; CHECK-NEXT: sub x9, x9, #1 // =1
-; CHECK-NEXT: cmp x9, #1 // =1
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csinc x9, x9, xzr, lo
+; CHECK-NEXT: cmp x9, #1 // =1
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #3
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT: csinc x9, x9, xzr, lo
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -266,15 +255,14 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntd x10
 ; CHECK-NEXT: sub x10, x10, #1 // =1
-; CHECK-NEXT: mov w9, #2
-; CHECK-NEXT: cmp x10, #2 // =2
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: mov w9, #2
+; CHECK-NEXT: cmp x10, #2 // =2
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #3
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -289,14 +277,13 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cnth x9
 ; CHECK-NEXT: sub x9, x9, #1 // =1
-; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #1
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -311,15 +298,14 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cnth x10
 ; CHECK-NEXT: sub x10, x10, #1 // =1
-; CHECK-NEXT: mov w9, #7
-; CHECK-NEXT: cmp x10, #7 // =7
 ; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: mov w9, #7
+; CHECK-NEXT: cmp x10, #7 // =7
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #1
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -335,15 +321,14 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cnth x10
 ; CHECK-NEXT: sub x10, x10, #1 // =1
-; CHECK-NEXT: mov w9, #8
-; CHECK-NEXT: cmp x10, #8 // =8
 ; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: mov w9, #8
+; CHECK-NEXT: cmp x10, #8 // =8
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #1
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -358,14 +343,13 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntw x9
 ; CHECK-NEXT: sub x9, x9, #1 // =1
-; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #2
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -380,15 +364,14 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntw x10
 ; CHECK-NEXT: sub x10, x10, #1 // =1
-; CHECK-NEXT: mov w9, #3
-; CHECK-NEXT: cmp x10, #3 // =3
 ; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: mov w9, #3
+; CHECK-NEXT: cmp x10, #3 // =3
 ; CHECK-NEXT: st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #2
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -404,15 +387,14 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntw x10
 ; CHECK-NEXT: sub x10, x10, #1 // =1
-; CHECK-NEXT: mov w9, #4
-; CHECK-NEXT: cmp x10, #4 // =4
 ; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: mov w9, #4
+; CHECK-NEXT: cmp x10, #4 // =4
 ; CHECK-NEXT: st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #2
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -427,14 +409,13 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntd x9
 ; CHECK-NEXT: sub x9, x9, #1 // =1
-; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: cmp x9, #0 // =0
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #3
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x9, xzr, lo
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -449,14 +430,13 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntd x9
 ; CHECK-NEXT: sub x9, x9, #1 // =1
-; CHECK-NEXT: cmp x9, #1 // =1
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csinc x9, x9, xzr, lo
+; CHECK-NEXT: cmp x9, #1 // =1
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #3
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT: csinc x9, x9, xzr, lo
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -472,15 +452,14 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntd x10
 ; CHECK-NEXT: sub x10, x10, #1 // =1
-; CHECK-NEXT: mov w9, #2
-; CHECK-NEXT: cmp x10, #2 // =2
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: mov w9, #2
+; CHECK-NEXT: cmp x10, #2 // =2
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #3
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -495,17 +474,16 @@
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntd x9
-; CHECK-NEXT: sub x9, x9, #1 // =1
 ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: cmp x9, #1 // =1
+; CHECK-NEXT: sub x9, x9, #1 // =1
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: mov z0.d, p1/z, #1 // =0x1
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csinc x9, x9, xzr, lo
+; CHECK-NEXT: cmp x9, #1 // =1
 ; CHECK-NEXT: st1d { z0.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #3
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT: csinc x9, x9, xzr, lo
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
 ; CHECK-NEXT: and z0.d, z0.d, #0x1
 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT: addvl sp, sp, #2
@@ -522,18 +500,17 @@
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntw x10
-; CHECK-NEXT: sub x10, x10, #1 // =1
 ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov w9, #2
-; CHECK-NEXT: cmp x10, #2 // =2
+; CHECK-NEXT: sub x10, x10, #1 // =1
 ; CHECK-NEXT: st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT: mov z0.s, p1/z, #1 // =0x1
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: mov w9, #2
+; CHECK-NEXT: cmp x10, #2 // =2
 ; CHECK-NEXT: st1w { z0.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #2
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT: and z0.s, z0.s, #0x1
 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT: addvl sp, sp, #2
@@ -550,18 +527,17 @@
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cnth x10
-; CHECK-NEXT: sub x10, x10, #1 // =1
 ; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1
 ; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: mov w9, #4
-; CHECK-NEXT: cmp x10, #4 // =4
+; CHECK-NEXT: sub x10, x10, #1 // =1
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT: mov z0.h, p1/z, #1 // =0x1
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: mov w9, #4
+; CHECK-NEXT: cmp x10, #4 // =4
 ; CHECK-NEXT: st1h { z0.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #1
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
 ; CHECK-NEXT: and z0.h, z0.h, #0x1
 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT: addvl sp, sp, #2
@@ -578,18 +554,17 @@
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: rdvl x9, #1
-; CHECK-NEXT: sub x9, x9, #1 // =1
 ; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
 ; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: mov w10, #8
-; CHECK-NEXT: cmp x9, #8 // =8
+; CHECK-NEXT: sub x9, x9, #1 // =1
 ; CHECK-NEXT: st1b { z0.b }, p0, [sp]
 ; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x9, x10, lo
+; CHECK-NEXT: mov w10, #8
+; CHECK-NEXT: cmp x9, #8 // =8
 ; CHECK-NEXT: st1b { z0.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT: csel x9, x9, x10, lo
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
 ; CHECK-NEXT: and z0.b, z0.b, #0x1
 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT: addvl sp, sp, #2
@@ -607,14 +582,13 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: cntd x9
 ; CHECK-NEXT: sub x9, x9, #1 // =1
-; CHECK-NEXT: cmp x9, #1 // =1
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csinc x9, x9, xzr, lo
+; CHECK-NEXT: cmp x9, #1 // =1
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9, lsl #3
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT: csinc x9, x9, xzr, lo
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -657,11 +631,10 @@
 ; CHECK-NEXT: addvl sp, sp, #-8
 ; CHECK-NEXT: rdvl x10, #1
 ; CHECK-NEXT: sub x10, x10, #1 // =1
-; CHECK-NEXT: mov w9, #16
-; CHECK-NEXT: cmp x10, #16 // =16
 ; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: mov w9, #16
+; CHECK-NEXT: cmp x10, #16 // =16
 ; CHECK-NEXT: st1w { z3.s }, p0, [x8, #3, mul vl]
 ; CHECK-NEXT: st1w { z2.s }, p0, [x8, #2, mul vl]
 ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
@@ -670,8 +643,9 @@
 ; CHECK-NEXT: st1w { z4.s }, p0, [x8, #4, mul vl]
 ; CHECK-NEXT: st1w { z5.s }, p0, [x8, #5, mul vl]
 ; CHECK-NEXT: st1w { z6.s }, p0, [x8, #6, mul vl]
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT: add x8, x8, x9, lsl #2
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl]
 ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8, #2, mul vl]
 ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8, #3, mul vl]
@@ -696,8 +670,8 @@
 ; CHECK-NEXT: st1b { z0.b }, p0, [sp]
 ; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT: addvl x8, x8, #1
-; CHECK-NEXT: sub x8, x8, #16 // =16
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT: mov x9, #-16
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -715,8 +689,8 @@
 ; CHECK-NEXT: st1b { z0.b }, p0, [sp]
 ; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT: addvl x8, x8, #1
-; CHECK-NEXT: sub x8, x8, #1 // =1
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT: mov x9, #-1
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -1205,8 +1179,8 @@
 ; CHECK-NEXT: st1b { z0.b }, p0, [sp]
 ; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT: addvl x8, x8, #1
-; CHECK-NEXT: sub x8, x8, #1 // =1
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT: mov x9, #-1
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
 ; CHECK-NEXT: and z0.b, z0.b, #0x1
 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT: addvl sp, sp, #2
diff --git a/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll b/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -disable-lsr < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; Check that the vscale call is recognised by the load/store reg/reg pattern and
+; partially folded, with the rest pulled out of the loop. This requires LSR to
+; be disabled, which is something that will be addressed at a later date.
+
+define void @ld1w_reg_loop([32000 x i32]* %addr) {
+; CHECK-LABEL: ld1w_reg_loop:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: cntw x9
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: .LBB0_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; CHECK-NEXT: adds x8, x8, x9
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+  %0 = call i64 @llvm.vscale.i64()
+  %1 = shl i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds [32000 x i32], [32000 x i32]* %addr, i64 0, i64 %index
+  %3 = bitcast i32* %2 to <vscale x 4 x i32>*
+  %load = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* %3, align 16
+  %index.next = add i64 %index, %1
+  %4 = icmp eq i64 %index.next, 0
+  br i1 %4, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @st1w_reg_loop([32000 x i32]* %addr, <vscale x 4 x i32> %val) {
+; CHECK-LABEL: st1w_reg_loop:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: cntw x9
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: .LBB1_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; CHECK-NEXT: adds x8, x8, x9
+; CHECK-NEXT: b.ne .LBB1_1
+; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+  %0 = call i64 @llvm.vscale.i64()
+  %1 = shl i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds [32000 x i32], [32000 x i32]* %addr, i64 0, i64 %index
+  %3 = bitcast i32* %2 to <vscale x 4 x i32>*
+  store volatile <vscale x 4 x i32> %val, <vscale x 4 x i32>* %3, align 16
+  %index.next = add i64 %index, %1
+  %4 = icmp eq i64 %index.next, 0
+  br i1 %4, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
diff --git a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
--- a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
@@ -43,9 +43,9 @@
 define <vscale x 16 x i8> @ld1b_out_of_upper_bound(<vscale x 16 x i8>* %a) {
 ; CHECK-LABEL: ld1b_out_of_upper_bound:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x8, x0, #8
+; CHECK-NEXT: rdvl x8, #8
 ; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
 ; CHECK-NEXT: ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 8
   %load = load <vscale x 16 x i8>, <vscale x 16 x i8>* %base
@@ -55,9 +55,9 @@
 define <vscale x 16 x i8> @ld1b_out_of_lower_bound(<vscale x 16 x i8>* %a) {
 ; CHECK-LABEL: ld1b_out_of_lower_bound:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x8, x0, #-9
+; CHECK-NEXT: rdvl x8, #-9
 ; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
 ; CHECK-NEXT: ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 -9
   %load = load <vscale x 16 x i8>, <vscale x 16 x i8>* %base
@@ -138,4 +138,3 @@
   %val = load volatile , * %a
   ret void
 }
-
diff --git a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll
@@ -0,0 +1,302 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; LD1B
+
+define <vscale x 16 x i8> @ld1_nxv16i8(i8* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 16 x i8>*
+  %val = load volatile <vscale x 16 x i8>, <vscale x 16 x i8>* %ptrcast
+  ret <vscale x 16 x i8> %val
+}
+
+define <vscale x 8 x i16> @ld1_nxv8i16_zext8(i8* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv8i16_zext8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 8 x i8>*
+  %val = load volatile <vscale x 8 x i8>, <vscale x 8 x i8>* %ptrcast
+  %zext = zext <vscale x 8 x i8> %val to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %zext
+}
+
+define <vscale x 4 x i32> @ld1_nxv4i32_zext8(i8* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv4i32_zext8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 4 x i8>*
+  %val = load volatile <vscale x 4 x i8>, <vscale x 4 x i8>* %ptrcast
+  %zext = zext <vscale x 4 x i8> %val to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %zext
+}
+
+define <vscale x 2 x i64> @ld1_nxv2i64_zext8(i8* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv2i64_zext8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 2 x i8>*
+  %val = load volatile <vscale x 2 x i8>, <vscale x 2 x i8>* %ptrcast
+  %zext = zext <vscale x 2 x i8> %val to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %zext
+}
+
+define <vscale x 8 x i16> @ld1_nxv8i16_sext8(i8* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv8i16_sext8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 8 x i8>*
+  %val = load volatile <vscale x 8 x i8>, <vscale x 8 x i8>* %ptrcast
+  %sext = sext <vscale x 8 x i8> %val to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %sext
+}
+
+define <vscale x 4 x i32> @ld1_nxv4i32_sext8(i8* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv4i32_sext8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 4 x i8>*
+  %val = load volatile <vscale x 4 x i8>, <vscale x 4 x i8>* %ptrcast
+  %sext = sext <vscale x 4 x i8> %val to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %sext
+}
+
+define <vscale x 2 x i64> @ld1_nxv2i64_sext8(i8* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv2i64_sext8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 2 x i8>*
+  %val = load volatile <vscale x 2 x i8>, <vscale x 2 x i8>* %ptrcast
+  %sext = sext <vscale x 2 x i8> %val to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %sext
+}
+
+; LD1H
+
+define <vscale x 8 x i16> @ld1_nxv8i16(i16* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i64 %off
+  %ptrcast = bitcast i16* %ptr to <vscale x 8 x i16>*
+  %val = load volatile <vscale x 8 x i16>, <vscale x 8 x i16>* %ptrcast
+  ret <vscale x 8 x i16> %val
+}
+
+define <vscale x 4 x i32> @ld1_nxv4i32_zext16(i16* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv4i32_zext16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i64 %off
+  %ptrcast = bitcast i16* %ptr to <vscale x 4 x i16>*
+  %val = load volatile <vscale x 4 x i16>, <vscale x 4 x i16>* %ptrcast
+  %zext = zext <vscale x 4 x i16> %val to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %zext
+}
+
+define <vscale x 2 x i64> @ld1_nxv2i64_zext16(i16* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv2i64_zext16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i64 %off
+  %ptrcast = bitcast i16* %ptr to <vscale x 2 x i16>*
+  %val = load volatile <vscale x 2 x i16>, <vscale x 2 x i16>* %ptrcast
+  %zext = zext <vscale x 2 x i16> %val to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %zext
+}
+
+define <vscale x 4 x i32> @ld1_nxv4i32_sext16(i16* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv4i32_sext16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i64 %off
+  %ptrcast = bitcast i16* %ptr to <vscale x 4 x i16>*
+  %val = load volatile <vscale x 4 x i16>, <vscale x 4 x i16>* %ptrcast
+  %sext = sext <vscale x 4 x i16> %val to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %sext
+}
+
+define <vscale x 2 x i64> @ld1_nxv2i64_sext16(i16* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv2i64_sext16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i64 %off
+  %ptrcast = bitcast i16* %ptr to <vscale x 2 x i16>*
+  %val = load volatile <vscale x 2 x i16>, <vscale x 2 x i16>* %ptrcast
+  %sext = sext <vscale x 2 x i16> %val to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %sext
+}
+
+define <vscale x 8 x half> @ld1_nxv8f16(half* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds half, half* %addr, i64 %off
+  %ptrcast = bitcast half* %ptr to <vscale x 8 x half>*
+  %val = load volatile <vscale x 8 x half>, <vscale x 8 x half>* %ptrcast
+  ret <vscale x 8 x half> %val
+}
+
+define <vscale x 8 x bfloat> @ld1_nxv8bf16(bfloat* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %off
+  %ptrcast = bitcast bfloat* %ptr to <vscale x 8 x bfloat>*
+  %val = load volatile <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %ptrcast
+  ret <vscale x 8 x bfloat> %val
+}
+
+define <vscale x 4 x half> @ld1_nxv4f16(half* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds half, half* %addr, i64 %off
+  %ptrcast = bitcast half* %ptr to <vscale x 4 x half>*
+  %val = load volatile <vscale x 4 x half>, <vscale x 4 x half>* %ptrcast
+  ret <vscale x 4 x half> %val
+}
+
+define <vscale x 2 x half> @ld1_nxv2f16(half* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds half, half* %addr, i64 %off
+  %ptrcast = bitcast half* %ptr to <vscale x 2 x half>*
+  %val = load volatile <vscale x 2 x half>, <vscale x 2 x half>* %ptrcast
+  ret <vscale x 2 x half> %val
+}
+
+; LD1W
+
+define <vscale x 4 x i32> @ld1_nxv4i32(i32* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i32, i32* %addr, i64 %off
+  %ptrcast = bitcast i32* %ptr to <vscale x 4 x i32>*
+  %val = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* %ptrcast
+  ret <vscale x 4 x i32> %val
+}
+
+define <vscale x 2 x i64> @ld1_nxv2i64_zext32(i32* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv2i64_zext32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i32, i32* %addr, i64 %off
+  %ptrcast = bitcast i32* %ptr to <vscale x 2 x i32>*
+  %val = load volatile <vscale x 2 x i32>, <vscale x 2 x i32>* %ptrcast
+  %zext = zext <vscale x 2 x i32> %val to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %zext
+}
+
+define <vscale x 2 x i64> @ld1_nxv2i64_sext32(i32* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv2i64_sext32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i32, i32* %addr, i64 %off
+  %ptrcast = bitcast i32* %ptr to <vscale x 2 x i32>*
+  %val = load volatile <vscale x 2 x i32>, <vscale x 2 x i32>* %ptrcast
+  %sext = sext <vscale x 2 x i32> %val to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %sext
+}
+
+define <vscale x 4 x float> @ld1_nxv4f32(float* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds float, float* %addr, i64 %off
+  %ptrcast = bitcast float* %ptr to <vscale x 4 x float>*
+  %val = load volatile <vscale x 4 x float>, <vscale x 4 x float>* %ptrcast
+  ret <vscale x 4 x float> %val
+}
+
+define <vscale x 2 x float> @ld1_nxv2f32(float* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds float, float* %addr, i64 %off
+  %ptrcast = bitcast float* %ptr to <vscale x 2 x float>*
+  %val = load volatile <vscale x 2 x float>, <vscale x 2 x float>* %ptrcast
+  ret <vscale x 2 x float> %val
+}
+
+; LD1D
+
+define <vscale x 2 x i64> @ld1_nxv2i64(i64* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i64, i64* %addr, i64 %off
+  %ptrcast = bitcast i64* %ptr to <vscale x 2 x i64>*
+  %val = load volatile <vscale x 2 x i64>, <vscale x 2 x i64>* %ptrcast
+  ret <vscale x 2 x i64> %val
+}
+
+define <vscale x 2 x double> @ld1_nxv2f64(double* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds double, double* %addr, i64 %off
+  %ptrcast = bitcast double* %ptr to <vscale x 2 x double>*
+  %val = load volatile <vscale x 2 x double>, <vscale x 2 x double>* %ptrcast
+  ret <vscale x 2 x double> %val
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
--- a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
@@ -43,9 +43,9 @@
 define void @st1b_out_of_upper_bound(<vscale x 16 x i8> %data, <vscale x 16 x i8>* %a) {
 ; CHECK-LABEL: st1b_out_of_upper_bound:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x8, x0, #8
+; CHECK-NEXT: rdvl x8, #8
 ; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: st1b { z0.b }, p0, [x8]
+; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8]
 ; CHECK-NEXT: ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 8
   store <vscale x 16 x i8> %data, <vscale x 16 x i8>* %base
@@ -55,9 +55,9 @@
 define void @st1b_out_of_lower_bound(<vscale x 16 x i8> %data, <vscale x 16 x i8>* %a) {
 ; CHECK-LABEL: st1b_out_of_lower_bound:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x8, x0, #-9
+; CHECK-NEXT: rdvl x8, #-9
 ; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: st1b { z0.b }, p0, [x8]
+; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8]
 ; CHECK-NEXT: ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 -9
   store <vscale x 16 x i8> %data, <vscale x 16 x i8>* %base
diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-reg.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-reg.ll
@@ -0,0 +1,224 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; ST1B
+
+define void @st1_nxv16i8(i8* %addr, i64 %off, <vscale x 16 x i8> %val) {
+; CHECK-LABEL: st1_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0, x1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 16 x i8>*
+  store <vscale x 16 x i8> %val, <vscale x 16 x i8>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv8i16_trunc8(i8* %addr, i64 %off, <vscale x 8 x i16> %val) {
+; CHECK-LABEL: st1_nxv8i16_trunc8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 8 x i8>*
+  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
+  store <vscale x 8 x i8> %trunc, <vscale x 8 x i8>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv4i32_trunc8(i8* %addr, i64 %off, <vscale x 4 x i32> %val) {
+; CHECK-LABEL: st1_nxv4i32_trunc8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 4 x i8>*
+  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
+  store <vscale x 4 x i8> %trunc, <vscale x 4 x i8>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv2i64_trunc8(i8* %addr, i64 %off, <vscale x 2 x i64> %val) {
+; CHECK-LABEL: st1_nxv2i64_trunc8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 2 x i8>*
+  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
+  store <vscale x 2 x i8> %trunc, <vscale x 2 x i8>* %ptrcast
+  ret void
+}
+
+; ST1H
+
+define void @st1_nxv8i16(i16* %addr, i64 %off, <vscale x 8 x i16> %val) {
+; CHECK-LABEL: st1_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i64 %off
+  %ptrcast = bitcast i16* %ptr to <vscale x 8 x i16>*
+  store <vscale x 8 x i16> %val, <vscale x 8 x i16>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv4i32_trunc16(i16* %addr, i64 %off, <vscale x 4 x i32> %val) {
+; CHECK-LABEL: st1_nxv4i32_trunc16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i64 %off
+  %ptrcast = bitcast i16* %ptr to <vscale x 4 x i16>*
+  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
+  store <vscale x 4 x i16> %trunc, <vscale x 4 x i16>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv2i64_trunc16(i16* %addr, i64 %off, <vscale x 2 x i64> %val) {
+; CHECK-LABEL: st1_nxv2i64_trunc16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i64 %off
+  %ptrcast = bitcast i16* %ptr to <vscale x 2 x i16>*
+  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
+  store <vscale x 2 x i16> %trunc, <vscale x 2 x i16>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv8f16(half* %addr, i64 %off, <vscale x 8 x half> %val) {
+; CHECK-LABEL: st1_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds half, half* %addr, i64 %off
+  %ptrcast = bitcast half* %ptr to <vscale x 8 x half>*
+  store <vscale x 8 x half> %val, <vscale x 8 x half>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv8bf16(bfloat* %addr, i64 %off, <vscale x 8 x bfloat> %val) {
+; CHECK-LABEL: st1_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %off
+  %ptrcast = bitcast bfloat* %ptr to <vscale x 8 x bfloat>*
+  store <vscale x 8 x bfloat> %val, <vscale x 8 x bfloat>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv4f16(half* %addr, i64 %off, <vscale x 4 x half> %val) {
+; CHECK-LABEL: st1_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds half, half* %addr, i64 %off
+  %ptrcast = bitcast half* %ptr to <vscale x 4 x half>*
+  store <vscale x 4 x half> %val, <vscale x 4 x half>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv2f16(half* %addr, i64 %off, <vscale x 2 x half> %val) {
+; CHECK-LABEL: st1_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds half, half* %addr, i64 %off
+  %ptrcast = bitcast half* %ptr to <vscale x 2 x half>*
+  store <vscale x 2 x half> %val, <vscale x 2 x half>* %ptrcast
+  ret void
+}
+
+; ST1W
+
+define void @st1_nxv4i32(i32* %addr, i64 %off, <vscale x 4 x i32> %val) {
+; CHECK-LABEL: st1_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i32, i32* %addr, i64 %off
+  %ptrcast = bitcast i32* %ptr to <vscale x 4 x i32>*
+  store <vscale x 4 x i32> %val, <vscale x 4 x i32>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv2i64_trunc32(i32* %addr, i64 %off, <vscale x 2 x i64> %val) {
+; CHECK-LABEL: st1_nxv2i64_trunc32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i32, i32* %addr, i64 %off
+  %ptrcast = bitcast i32* %ptr to <vscale x 2 x i32>*
+  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
+  store <vscale x 2 x i32> %trunc, <vscale x 2 x i32>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv4f32(float* %addr, i64 %off, <vscale x 4 x float> %val) {
+; CHECK-LABEL: st1_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds float, float* %addr, i64 %off
+  %ptrcast = bitcast float* %ptr to <vscale x 4 x float>*
+  store <vscale x 4 x float> %val, <vscale x 4 x float>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv2f32(float* %addr, i64 %off, <vscale x 2 x float> %val) {
+; CHECK-LABEL: st1_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds float, float* %addr, i64 %off
+  %ptrcast = bitcast float* %ptr to <vscale x 2 x float>*
+  store <vscale x 2 x float> %val, <vscale x 2 x float>* %ptrcast
+  ret void
+}
+
+; ST1D
+
+define void @st1_nxv2i64(i64* %addr, i64 %off, <vscale x 2 x i64> %val) {
+; CHECK-LABEL: st1_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i64, i64* %addr, i64 %off
+  %ptrcast = bitcast i64* %ptr to <vscale x 2 x i64>*
+  store <vscale x 2 x i64> %val, <vscale x 2 x i64>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv2f64(double* %addr, i64 %off, <vscale x 2 x double> %val) {
+; CHECK-LABEL: st1_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds double, double* %addr, i64 %off
+  %ptrcast = bitcast double* %ptr to <vscale x 2 x double>*
+  store <vscale x 2 x double> %val, <vscale x 2 x double>* %ptrcast
+  ret void
+}