diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1975,6 +1975,25 @@
   defm : unpred_load<load, nxv2f32, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
   defm : unpred_load<load, nxv2f64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
 
+  // Allow using the reg+reg form of ld1b/st1b for memory accesses with the
+  // same width as nxv16i8. This saves an add in cases where we would
+  // otherwise compute the address separately.
+  multiclass unpred_loadstore_bitcast<ValueType Ty> {
+    let Predicates = [IsLE] in {
+      def : Pat<(Ty (load (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset))),
+                (LD1B (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>;
+      def : Pat<(store (Ty ZPR:$val), (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset)),
+                (ST1B ZPR:$val, (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>;
+    }
+  }
+  defm : unpred_loadstore_bitcast<nxv8i16>;
+  defm : unpred_loadstore_bitcast<nxv8f16>;
+  defm : unpred_loadstore_bitcast<nxv8bf16>;
+  defm : unpred_loadstore_bitcast<nxv4f32>;
+  defm : unpred_loadstore_bitcast<nxv4i32>;
+  defm : unpred_loadstore_bitcast<nxv2i64>;
+  defm : unpred_loadstore_bitcast<nxv2f64>;
+
   multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
     def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
                   (Store PPR:$val, GPR64sp:$base, simm9:$offset)>;
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -652,11 +652,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #16 // =16
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-16
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -671,11 +672,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #2 // =2
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-2
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -714,11 +716,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #16 // =16
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-16
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -733,11 +736,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #4 // =4
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-4
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -776,11 +780,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #16 // =16
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-16
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -795,11 +800,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #8 // =8
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-8
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -838,11 +844,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #16 // =16
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-16
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -857,11 +864,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #2 // =2
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-2
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -900,11 +908,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #16 // =16
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-16
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -919,11 +928,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #4 // =4
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-4
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -962,11 +972,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #16 // =16
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-16
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -981,11 +992,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #8 // =8
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-8
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1027,11 +1039,12 @@
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z1.d, p1/z, #1 // =0x1
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #8 // =8
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-8
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    and z0.d, z0.d, #0x1
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -1051,11 +1064,12 @@
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov z1.s, p1/z, #1 // =0x1
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #4 // =4
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-4
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    and z0.s, z0.s, #0x1
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -1075,11 +1089,12 @@
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov z1.h, p1/z, #1 // =0x1
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #2 // =2
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-2
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    and z0.h, z0.h, #0x1
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -1121,11 +1136,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, #16 // =16
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    mov x9, #-16
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1141,13 +1157,15 @@
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    mov x9, #-32
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #3, mul vl]
 ; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #2, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #2
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    sub x8, x8, #32 // =32
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll
--- a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll
@@ -15,6 +15,42 @@
   ret <vscale x 16 x i8> %val
 }
 
+define <vscale x 8 x i16> @ld1_nxv16i8_bitcast_to_i16(i8* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv16i8_bitcast_to_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 8 x i16>*
+  %val = load volatile <vscale x 8 x i16>, <vscale x 8 x i16>* %ptrcast
+  ret <vscale x 8 x i16> %val
+}
+
+define <vscale x 4 x i32> @ld1_nxv16i8_bitcast_to_i32(i8* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv16i8_bitcast_to_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 4 x i32>*
+  %val = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* %ptrcast
+  ret <vscale x 4 x i32> %val
+}
+
+define <vscale x 2 x i64> @ld1_nxv16i8_bitcast_to_i64(i8* %addr, i64 %off) {
+; CHECK-LABEL: ld1_nxv16i8_bitcast_to_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 2 x i64>*
+  %val = load volatile <vscale x 2 x i64>, <vscale x 2 x i64>* %ptrcast
+  ret <vscale x 2 x i64> %val
+}
+
 define <vscale x 8 x i16> @ld1_nxv8i16_zext8(i8* %addr, i64 %off) {
 ; CHECK-LABEL: ld1_nxv8i16_zext8:
 ; CHECK:       // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-reg.ll
--- a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-reg.ll
+++ b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-reg.ll
@@ -15,6 +15,42 @@
   ret void
 }
 
+define void @st1_nxv16i8_bitcast_from_i16(i8* %addr, i64 %off, <vscale x 8 x i16> %val) {
+; CHECK-LABEL: st1_nxv16i8_bitcast_from_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 8 x i16>*
+  store <vscale x 8 x i16> %val, <vscale x 8 x i16>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv16i8_bitcast_from_i32(i8* %addr, i64 %off, <vscale x 4 x i32> %val) {
+; CHECK-LABEL: st1_nxv16i8_bitcast_from_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 4 x i32>*
+  store <vscale x 4 x i32> %val, <vscale x 4 x i32>* %ptrcast
+  ret void
+}
+
+define void @st1_nxv16i8_bitcast_from_i64(i8* %addr, i64 %off, <vscale x 2 x i64> %val) {
+; CHECK-LABEL: st1_nxv16i8_bitcast_from_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x1]
+; CHECK-NEXT:    ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 %off
+  %ptrcast = bitcast i8* %ptr to <vscale x 2 x i64>*
+  store <vscale x 2 x i64> %val, <vscale x 2 x i64>* %ptrcast
+  ret void
+}
+
 define void @st1_nxv8i16_trunc8(i8* %addr, i64 %off, <vscale x 8 x i16> %val) {
 ; CHECK-LABEL: st1_nxv8i16_trunc8:
 ; CHECK:       // %bb.0:
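
Note: a minimal IR sketch of the access pattern the new unpred_loadstore_bitcast Pats target (the function name is illustrative; the expected codegen mirrors the tests above). On little-endian targets an nxv4i32 load covers the same bytes as an nxv16i8 load of the same width, so the reg+reg ld1b form can fold a byte offset that ld1w's element-scaled (lsl #2) reg+reg mode cannot:

define <vscale x 4 x i32> @sketch(i8* %base, i64 %off) {
  %addr = getelementptr inbounds i8, i8* %base, i64 %off
  %cast = bitcast i8* %addr to <vscale x 4 x i32>*
  ; With the new pattern (little-endian only):
  ;   ld1b { z0.b }, p0/z, [x0, x1]
  ; Previously the byte offset forced the address to be materialized first:
  ;   add x8, x0, x1
  ;   ld1w { z0.s }, p0/z, [x8]
  %val = load <vscale x 4 x i32>, <vscale x 4 x i32>* %cast
  ret <vscale x 4 x i32> %val
}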