diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1856,13 +1856,18 @@
   defm : pred_store;
   defm : pred_store;
 
-  multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegImmInst,
-                          Instruction PTrue> {
+  multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegRegInst,
+                          Instruction RegImmInst, Instruction PTrue,
+                          ComplexPattern AddrCP> {
     let AddedComplexity = 1 in {
+      def _reg : Pat<(Store (Ty ZPR:$val), (AddrCP GPR64sp:$base, GPR64:$offset)),
+                     (RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
+    }
+    let AddedComplexity = 2 in {
       def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
                      (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
     }
-    let AddedComplexity = 2 in {
+    let AddedComplexity = 3 in {
       def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
                     (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
     }
@@ -1871,32 +1876,36 @@
               (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
   }
 
-  defm : unpred_store<         store, nxv16i8,    ST1B_IMM, PTRUE_B>;
-  defm : unpred_store< truncstorevi8, nxv8i16,  ST1B_H_IMM, PTRUE_H>;
-  defm : unpred_store< truncstorevi8, nxv4i32,  ST1B_S_IMM, PTRUE_S>;
-  defm : unpred_store< truncstorevi8, nxv2i64,  ST1B_D_IMM, PTRUE_D>;
-  defm : unpred_store<         store, nxv8i16,    ST1H_IMM, PTRUE_H>;
-  defm : unpred_store<truncstorevi16, nxv4i32,  ST1H_S_IMM, PTRUE_S>;
-  defm : unpred_store<truncstorevi16, nxv2i64,  ST1H_D_IMM, PTRUE_D>;
-  defm : unpred_store<         store, nxv4i32,    ST1W_IMM, PTRUE_S>;
-  defm : unpred_store<truncstorevi32, nxv2i64,  ST1W_D_IMM, PTRUE_D>;
-  defm : unpred_store<         store, nxv2i64,    ST1D_IMM, PTRUE_D>;
-  defm : unpred_store<         store, nxv8f16,    ST1H_IMM, PTRUE_H>;
-  defm : unpred_store<         store, nxv8bf16,   ST1H_IMM, PTRUE_H>;
-  defm : unpred_store<         store, nxv4f16,  ST1H_S_IMM, PTRUE_S>;
-  defm : unpred_store<         store, nxv2f16,  ST1H_D_IMM, PTRUE_D>;
-  defm : unpred_store<         store, nxv4f32,    ST1W_IMM, PTRUE_S>;
-  defm : unpred_store<         store, nxv2f32,  ST1W_D_IMM, PTRUE_D>;
-  defm : unpred_store<         store, nxv2f64,    ST1D_IMM, PTRUE_D>;
-
-  multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegImmInst,
-                         Instruction PTrue> {
+  defm : unpred_store<         store, nxv16i8,    ST1B,   ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
+  defm : unpred_store< truncstorevi8, nxv8i16,  ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
+  defm : unpred_store< truncstorevi8, nxv4i32,  ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
+  defm : unpred_store< truncstorevi8, nxv2i64,  ST1B_D, ST1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
+  defm : unpred_store<         store, nxv8i16,    ST1H,   ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+  defm : unpred_store<truncstorevi16, nxv4i32,  ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+  defm : unpred_store<truncstorevi16, nxv2i64,  ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+  defm : unpred_store<         store, nxv4i32,    ST1W,   ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+  defm : unpred_store<truncstorevi32, nxv2i64,  ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+  defm : unpred_store<         store, nxv2i64,    ST1D,   ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
+  defm : unpred_store<         store, nxv8f16,    ST1H,   ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+  defm : unpred_store<         store, nxv8bf16,   ST1H,   ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+  defm : unpred_store<         store, nxv4f16,  ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+  defm : unpred_store<         store, nxv2f16,  ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+  defm : unpred_store<         store, nxv4f32,    ST1W,   ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+  defm : unpred_store<         store, nxv2f32,  ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+  defm : unpred_store<         store, nxv2f64,    ST1D,   ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
+
+  multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegRegInst,
+                         Instruction RegImmInst, Instruction PTrue,
+                         ComplexPattern AddrCP> {
     let AddedComplexity = 1 in {
+      def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))),
+                    (RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
+    }
+    let AddedComplexity = 2 in {
       def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
                    (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
     }
-
-    let AddedComplexity = 2 in {
+    let AddedComplexity = 3 in {
       def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
                     (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
     }
@@ -1905,35 +1914,35 @@
               (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
   }
 
-  defm : unpred_load<        load, nxv16i8,    LD1B_IMM, PTRUE_B>;
-  defm : unpred_load< zextloadvi8, nxv8i16,  LD1B_H_IMM, PTRUE_H>;
-  defm : unpred_load< zextloadvi8, nxv4i32,  LD1B_S_IMM, PTRUE_S>;
-  defm : unpred_load< zextloadvi8, nxv2i64,  LD1B_D_IMM, PTRUE_D>;
-  defm : unpred_load<  extloadvi8, nxv8i16,  LD1B_H_IMM, PTRUE_H>;
-  defm : unpred_load<  extloadvi8, nxv4i32,  LD1B_S_IMM, PTRUE_S>;
-  defm : unpred_load<  extloadvi8, nxv2i64,  LD1B_D_IMM, PTRUE_D>;
-  defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H_IMM, PTRUE_H>;
-  defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S_IMM, PTRUE_S>;
-  defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D_IMM, PTRUE_D>;
-  defm : unpred_load<        load, nxv8i16,    LD1H_IMM, PTRUE_H>;
-  defm : unpred_load<zextloadvi16, nxv4i32,  LD1H_S_IMM, PTRUE_S>;
-  defm : unpred_load<zextloadvi16, nxv2i64,  LD1H_D_IMM, PTRUE_D>;
-  defm : unpred_load< extloadvi16, nxv4i32,  LD1H_S_IMM, PTRUE_S>;
-  defm : unpred_load< extloadvi16, nxv2i64,  LD1H_D_IMM, PTRUE_D>;
-  defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S_IMM, PTRUE_S>;
-  defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D_IMM, PTRUE_D>;
-  defm : unpred_load<        load, nxv4i32,    LD1W_IMM, PTRUE_S>;
-  defm : unpred_load<zextloadvi32, nxv2i64,  LD1W_D_IMM, PTRUE_D>;
-  defm : unpred_load< extloadvi32, nxv2i64,  LD1W_D_IMM, PTRUE_D>;
-  defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D_IMM, PTRUE_D>;
-  defm : unpred_load<        load, nxv2i64,    LD1D_IMM, PTRUE_D>;
-  defm : unpred_load<        load, nxv8f16,    LD1H_IMM, PTRUE_H>;
-  defm : unpred_load<        load, nxv8bf16,   LD1H_IMM, PTRUE_H>;
-  defm : unpred_load<        load, nxv4f16,  LD1H_S_IMM, PTRUE_S>;
-  defm : unpred_load<        load, nxv2f16,  LD1H_D_IMM, PTRUE_D>;
-  defm : unpred_load<        load, nxv4f32,    LD1W_IMM, PTRUE_S>;
-  defm : unpred_load<        load, nxv2f32,  LD1W_D_IMM, PTRUE_D>;
-  defm : unpred_load<        load, nxv2f64,    LD1D_IMM, PTRUE_D>;
+  defm : unpred_load<        load, nxv16i8,    LD1B,    LD1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
+  defm : unpred_load< zextloadvi8, nxv8i16,  LD1B_H,  LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
+  defm : unpred_load< zextloadvi8, nxv4i32,  LD1B_S,  LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
+  defm : unpred_load< zextloadvi8, nxv2i64,  LD1B_D,  LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
+  defm : unpred_load<  extloadvi8, nxv8i16,  LD1B_H,  LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
+  defm : unpred_load<  extloadvi8, nxv4i32,  LD1B_S,  LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
+  defm : unpred_load<  extloadvi8, nxv2i64,  LD1B_D,  LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
+  defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H, LD1SB_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
+  defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S, LD1SB_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
+  defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D, LD1SB_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
+  defm : unpred_load<        load, nxv8i16,    LD1H,    LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+  defm : unpred_load<zextloadvi16, nxv4i32,  LD1H_S,  LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+  defm : unpred_load<zextloadvi16, nxv2i64,  LD1H_D,  LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+  defm : unpred_load< extloadvi16, nxv4i32,  LD1H_S,  LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+  defm : unpred_load< extloadvi16, nxv2i64,  LD1H_D,  LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+  defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S, LD1SH_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+  defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D, LD1SH_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+  defm : unpred_load<        load, nxv4i32,    LD1W,    LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+  defm : unpred_load<zextloadvi32, nxv2i64,  LD1W_D,  LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+  defm : unpred_load< extloadvi32, nxv2i64,  LD1W_D,  LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+  defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D, LD1SW_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+  defm : unpred_load<        load, nxv2i64,    LD1D,    LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
+  defm : unpred_load<        load, nxv8f16,    LD1H,    LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+  defm : unpred_load<        load, nxv8bf16,   LD1H,    LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+  defm : unpred_load<        load, nxv4f16,  LD1H_S,  LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+  defm : unpred_load<        load, nxv2f16,  LD1H_D,  LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+  defm : unpred_load<        load, nxv4f32,    LD1W,    LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+  defm : unpred_load<        load, nxv2f32,  LD1W_D,  LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+  defm : unpred_load<        load, nxv2f64,    LD1D,    LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
 
   multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
     def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -1196,6 +1196,11 @@
       return false;
 
     using namespace PatternMatch;
+    if (match(CI, m_Intrinsic<Intrinsic::vscale>()))
+      // Vscale calls can be partially folded directly into other instructions,
+      // let the MachineLICM pass handle anything that is not.
+      return false;
+
     if (match(CI, m_Intrinsic<Intrinsic::assume>()))
      // Assumes don't actually alias anything or throw
      return true;
diff --git a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
--- a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
@@ -43,9 +43,9 @@
 define <vscale x 16 x i8> @ld1b_out_of_upper_bound(<vscale x 16 x i8>* %a) {
 ; CHECK-LABEL: ld1b_out_of_upper_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    addvl x8, x0, #8
+; CHECK-NEXT:    rdvl x8, #8
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; CHECK-NEXT:    ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 8
   %load = load <vscale x 16 x i8>, <vscale x 16 x i8>* %base
@@ -55,9 +55,9 @@
 define <vscale x 16 x i8> @ld1b_out_of_lower_bound(<vscale x 16 x i8>* %a) {
 ; CHECK-LABEL: ld1b_out_of_lower_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    addvl x8, x0, #-9
+; CHECK-NEXT:    rdvl x8, #-9
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; CHECK-NEXT:    ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 -9
   %load = load <vscale x 16 x i8>, <vscale x 16 x i8>* %base
@@ -139,3 +139,95 @@
   ret void
 }
 
+define void @ld1b_reg([64 x i8]* %addr, i64 %off) {
+; CHECK-LABEL: ld1b_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    add x8, x1, x8
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+  %vscale = call i64 @llvm.vscale.i64()
+  %mul = mul i64 %vscale, 4
+  %offset = add i64 %off, %mul
+  %ptr = getelementptr inbounds [64 x i8], [64 x i8]* %addr, i64 0, i64 %offset
+  %ptrcast = bitcast i8* %ptr to <vscale x 16 x i8>*
+  %val = load volatile <vscale x 16 x i8>, <vscale x 16 x i8>* %ptrcast
+  ret void
+}
+
+define void @ld1h_reg([64 x i16]* %addr, i64 %off) {
+; CHECK-LABEL: ld1h_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    add x8, x1, x8
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+  %vscale = call i64 @llvm.vscale.i64()
+  %mul = mul i64 %vscale, 4
+  %offset = add i64 %off, %mul
+  %ptr = getelementptr inbounds [64 x i16], [64 x i16]* %addr, i64 0, i64 %offset
+  %ptrcast = bitcast i16* %ptr to <vscale x 8 x i16>*
+  %val = load volatile <vscale x 8 x i16>, <vscale x 8 x i16>* %ptrcast
+  ret void
+}
+
+define void @ld1w_reg([64 x i32]* %addr, i64 %off) {
+; CHECK-LABEL: ld1w_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    add x8, x1, x8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+  %vscale = call i64 @llvm.vscale.i64()
+  %mul = mul i64 %vscale, 4
+  %offset = add i64 %off, %mul
+  %ptr = getelementptr inbounds [64 x i32], [64 x i32]* %addr, i64 0, i64 %offset
+  %ptrcast = bitcast i32* %ptr to <vscale x 4 x i32>*
+  %val = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* %ptrcast
+  ret void
+}
+
+define void @ld1d_reg([64 x i64]* %addr, i64 %off) {
+; CHECK-LABEL: ld1d_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    add x8, x1, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+  %vscale = call i64 @llvm.vscale.i64()
+  %mul = mul i64 %vscale, 4
+  %offset = add i64 %off, %mul
+  %ptr = getelementptr inbounds [64 x i64], [64 x i64]* %addr, i64 0, i64 %offset
+  %ptrcast = bitcast i64* %ptr to <vscale x 2 x i64>*
+  %val = load volatile <vscale x 2 x i64>, <vscale x 2 x i64>* %ptrcast
+  ret void
+}
+
+; Check that vscale call is recognised by load reg/reg pattern and partially
+; folded, with the rest pulled out of the loop
+define void @ld1d_reg_loop([64 x i64]* %addr, i64 %off) {
+; CHECK-LABEL: ld1d_reg_loop:
+; CHECK:       // %bb.0
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    add x8, x1, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:  [[BB:\.LBB[0-9_]+]]: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+entry:
+  br label %loop
+
+loop:
+  %vscale = call i64 @llvm.vscale.i64()
+  %mul = mul i64 %vscale, 4
+  %offset = add i64 %off, %mul
+  %ptr = getelementptr inbounds [64 x i64], [64 x i64]* %addr, i64 0, i64 %offset
+  %ptrcast = bitcast i64* %ptr to <vscale x 2 x i64>*
+  %val = load volatile <vscale x 2 x i64>, <vscale x 2 x i64>* %ptrcast
+  br label %loop
+
+exit:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
--- a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
@@ -43,9 +43,9 @@
 define void @st1b_out_of_upper_bound(<vscale x 16 x i8> %data, <vscale x 16 x i8>* %a) {
 ; CHECK-LABEL: st1b_out_of_upper_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    addvl x8, x0, #8
+; CHECK-NEXT:    rdvl x8, #8
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    st1b { z0.b }, p0, [x8]
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
 ; CHECK-NEXT:    ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 8
   store <vscale x 16 x i8> %data, <vscale x 16 x i8>* %base
@@ -55,9 +55,9 @@
 define void @st1b_out_of_lower_bound(<vscale x 16 x i8> %data, <vscale x 16 x i8>* %a) {
 ; CHECK-LABEL: st1b_out_of_lower_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    addvl x8, x0, #-9
+; CHECK-NEXT:    rdvl x8, #-9
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    st1b { z0.b }, p0, [x8]
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
 ; CHECK-NEXT:    ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 -9
   store <vscale x 16 x i8> %data, <vscale x 16 x i8>* %base
@@ -165,3 +165,96 @@
   store %splat, * %out
   ret void
 }
+
+define void @st1b_reg([64 x i8]* %addr, i64 %off, <vscale x 16 x i8> %val) {
+; CHECK-LABEL: st1b_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    add x8, x1, x8
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
+  %vscale = call i64 @llvm.vscale.i64()
+  %mul = mul i64 %vscale, 4
+  %offset = add i64 %off, %mul
+  %ptr = getelementptr inbounds [64 x i8], [64 x i8]* %addr, i64 0, i64 %offset
+  %ptrcast = bitcast i8* %ptr to <vscale x 16 x i8>*
+  store <vscale x 16 x i8> %val, <vscale x 16 x i8>* %ptrcast
+  ret void
+}
+
+define void @st1h_reg([64 x i16]* %addr, i64 %off, <vscale x 8 x i16> %val) {
+; CHECK-LABEL: st1h_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    add x8, x1, x8
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+  %vscale = call i64 @llvm.vscale.i64()
+  %mul = mul i64 %vscale, 4
+  %offset = add i64 %off, %mul
+  %ptr = getelementptr inbounds [64 x i16], [64 x i16]* %addr, i64 0, i64 %offset
+  %ptrcast = bitcast i16* %ptr to <vscale x 8 x i16>*
+  store <vscale x 8 x i16> %val, <vscale x 8 x i16>* %ptrcast
+  ret void
+}
+
+define void @st1w_reg([64 x i32]* %addr, i64 %off, <vscale x 4 x i32> %val) {
+; CHECK-LABEL: st1w_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    add x8, x1, x8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+  %vscale = call i64 @llvm.vscale.i64()
+  %mul = mul i64 %vscale, 4
+  %offset = add i64 %off, %mul
+  %ptr = getelementptr inbounds [64 x i32], [64 x i32]* %addr, i64 0, i64 %offset
+  %ptrcast = bitcast i32* %ptr to <vscale x 4 x i32>*
+  store <vscale x 4 x i32> %val, <vscale x 4 x i32>* %ptrcast
+  ret void
+}
+
+define void @st1d_reg([64 x i64]* %addr, i64 %off, <vscale x 2 x i64> %val) {
+; CHECK-LABEL: st1d_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    add x8, x1, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+  %vscale = call i64 @llvm.vscale.i64()
+  %mul = mul i64 %vscale, 4
+  %offset = add i64 %off, %mul
+  %ptr = getelementptr inbounds [64 x i64], [64 x i64]* %addr, i64 0, i64 %offset
+  %ptrcast = bitcast i64* %ptr to <vscale x 2 x i64>*
+  store <vscale x 2 x i64> %val, <vscale x 2 x i64>* %ptrcast
+  ret void
+}
+
+; Check that vscale call is recognised by store reg/reg pattern and partially
+; folded, with the rest pulled out of the loop
+define void @st1d_reg_loop([64 x i64]* %addr, i64 %off, <vscale x 2 x i64> %val) {
+; CHECK-LABEL: st1d_reg_loop:
+; CHECK:       // %bb.0
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    add x8, x1, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:  [[BB:\.LBB[0-9_]+]]: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+entry:
+  br label %loop
+
+loop:
+  %vscale = call i64 @llvm.vscale.i64()
+  %mul = mul i64 %vscale, 4
+  %offset = add i64 %off, %mul
+  %ptr = getelementptr inbounds [64 x i64], [64 x i64]* %addr, i64 0, i64 %offset
+  %ptrcast = bitcast i64* %ptr to <vscale x 2 x i64>*
+  store <vscale x 2 x i64> %val, <vscale x 2 x i64>* %ptrcast
+  br label %loop
+
+exit:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()