diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -931,6 +931,7 @@ } +def llvm_nxv1i1_ty : LLVMType; def llvm_nxv2i1_ty : LLVMType; def llvm_nxv4i1_ty : LLVMType; def llvm_nxv8i1_ty : LLVMType; @@ -2592,27 +2593,27 @@ // Loads def int_aarch64_sme_ld1b_horiz : SME_Load_Store_Intrinsic; - def int_aarch64_sme_ld1h_horiz : SME_Load_Store_Intrinsic; - def int_aarch64_sme_ld1w_horiz : SME_Load_Store_Intrinsic; - def int_aarch64_sme_ld1d_horiz : SME_Load_Store_Intrinsic; - def int_aarch64_sme_ld1q_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1h_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1w_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1d_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1q_horiz : SME_Load_Store_Intrinsic; def int_aarch64_sme_ld1b_vert : SME_Load_Store_Intrinsic; - def int_aarch64_sme_ld1h_vert : SME_Load_Store_Intrinsic; - def int_aarch64_sme_ld1w_vert : SME_Load_Store_Intrinsic; - def int_aarch64_sme_ld1d_vert : SME_Load_Store_Intrinsic; - def int_aarch64_sme_ld1q_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1h_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1w_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1d_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_ld1q_vert : SME_Load_Store_Intrinsic; // Stores def int_aarch64_sme_st1b_horiz : SME_Load_Store_Intrinsic; - def int_aarch64_sme_st1h_horiz : SME_Load_Store_Intrinsic; - def int_aarch64_sme_st1w_horiz : SME_Load_Store_Intrinsic; - def int_aarch64_sme_st1d_horiz : SME_Load_Store_Intrinsic; - def int_aarch64_sme_st1q_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1h_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1w_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1d_horiz : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1q_horiz : SME_Load_Store_Intrinsic; def int_aarch64_sme_st1b_vert : SME_Load_Store_Intrinsic; - def int_aarch64_sme_st1h_vert : SME_Load_Store_Intrinsic; - def int_aarch64_sme_st1w_vert : SME_Load_Store_Intrinsic; - def int_aarch64_sme_st1d_vert : SME_Load_Store_Intrinsic; - def int_aarch64_sme_st1q_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1h_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1w_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1d_vert : SME_Load_Store_Intrinsic; + def int_aarch64_sme_st1q_vert : SME_Load_Store_Intrinsic; // Spill + fill def int_aarch64_sme_ldr : DefaultAttrsIntrinsic< diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll @@ -30,7 +30,7 @@ ret void; } -define void @ld1h( %pg, ptr %ptr, i32 %sliceidx) { +define void @ld1h( %pg, ptr %ptr, i32 %sliceidx) { ; CHECK-LABEL: ld1h: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w1 @@ -41,14 +41,14 @@ ; CHECK-NEXT: ld1h {za1v.h[w12, 7]}, p0/z, [x0] ; CHECK-NEXT: ret %tileslice = add i32 %sliceidx, 7 - call void @llvm.aarch64.sme.ld1h.horiz( %pg, ptr %ptr, i64 0, i32 %tileslice) - call void @llvm.aarch64.sme.ld1h.horiz( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.ld1h.vert( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.ld1h.vert( %pg, ptr %ptr, i64 1, i32 %tileslice) + call void @llvm.aarch64.sme.ld1h.horiz( %pg, ptr %ptr, i64 0, i32 %tileslice) + call void @llvm.aarch64.sme.ld1h.horiz( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.ld1h.vert( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.ld1h.vert( %pg, ptr %ptr, i64 1, i32 %tileslice) ret void; } -define void @ld1h_with_addr_offset( %pg, ptr %ptr, i64 %index, i32 %sliceidx) { +define void @ld1h_with_addr_offset( %pg, ptr %ptr, i64 %index, i32 %sliceidx) { ; CHECK-LABEL: ld1h_with_addr_offset: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w2 @@ -58,12 +58,12 @@ ; CHECK-NEXT: ret %base = getelementptr i16, ptr %ptr, i64 %index %tileslice = add i32 %sliceidx, 7 - call void @llvm.aarch64.sme.ld1h.horiz( %pg, ptr %base, i64 0, i32 %tileslice) - call void @llvm.aarch64.sme.ld1h.vert( %pg, ptr %base, i64 1, i32 0) + call void @llvm.aarch64.sme.ld1h.horiz( %pg, ptr %base, i64 0, i32 %tileslice) + call void @llvm.aarch64.sme.ld1h.vert( %pg, ptr %base, i64 1, i32 0) ret void; } -define void @ld1w( %pg, ptr %ptr, i32 %sliceidx) { +define void @ld1w( %pg, ptr %ptr, i32 %sliceidx) { ; CHECK-LABEL: ld1w: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, wzr @@ -78,18 +78,18 @@ ; CHECK-NEXT: ld1w {za3v.s[w12, 0]}, p0/z, [x0] ; CHECK-NEXT: ret %tileslice = add i32 %sliceidx, 3 - call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %ptr, i64 2, i32 0) - call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %ptr, i64 3, i32 %tileslice) - call void @llvm.aarch64.sme.ld1w.vert( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.ld1w.vert( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.ld1w.vert( %pg, ptr %ptr, i64 2, i32 %tileslice) - call void @llvm.aarch64.sme.ld1w.vert( %pg, ptr %ptr, i64 3, i32 0) + call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %ptr, i64 2, i32 0) + call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %ptr, i64 3, i32 %tileslice) + call void @llvm.aarch64.sme.ld1w.vert( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.ld1w.vert( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.ld1w.vert( %pg, ptr %ptr, i64 2, i32 %tileslice) + call void @llvm.aarch64.sme.ld1w.vert( %pg, ptr %ptr, i64 3, i32 0) ret void; } -define void @ld1w_with_addr_offset( %pg, ptr %ptr, i64 %index, i32 %sliceidx) { +define void @ld1w_with_addr_offset( %pg, ptr %ptr, i64 %index, i32 %sliceidx) { ; CHECK-LABEL: ld1w_with_addr_offset: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w2 @@ -99,12 +99,12 @@ ; CHECK-NEXT: ret %base = getelementptr i32, ptr %ptr, i64 %index %tileslice = add i32 %sliceidx, 3 - call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %base, i64 0, i32 0) - call void @llvm.aarch64.sme.ld1w.vert( %pg, ptr %base, i64 3, i32 %tileslice) + call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %base, i64 0, i32 0) + call void @llvm.aarch64.sme.ld1w.vert( %pg, ptr %base, i64 3, i32 %tileslice) ret void; } -define void @ld1d( %pg, ptr %ptr, i32 %sliceidx) { +define void @ld1d( %pg, ptr %ptr, i32 %sliceidx) { ; CHECK-LABEL: ld1d: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w13, wzr @@ -127,26 +127,26 @@ ; CHECK-NEXT: ld1d {za7v.d[w12, 1]}, p0/z, [x0] ; CHECK-NEXT: ret %tileslice = add i32 %sliceidx, 1 - call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 2, i32 0) - call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 3, i32 0) - call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 4, i32 %tileslice) - call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 5, i32 0) - call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 6, i32 0) - call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 7, i32 0) - call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 2, i32 0) - call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 3, i32 0) - call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 4, i32 0) - call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 5, i32 0) - call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 6, i32 0) - call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 7, i32 %tileslice) + call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 2, i32 0) + call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 3, i32 0) + call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 4, i32 %tileslice) + call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 5, i32 0) + call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 6, i32 0) + call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %ptr, i64 7, i32 0) + call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 2, i32 0) + call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 3, i32 0) + call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 4, i32 0) + call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 5, i32 0) + call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 6, i32 0) + call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %ptr, i64 7, i32 %tileslice) ret void; } -define void @ld1d_with_addr_offset( %pg, ptr %ptr, i64 %index, i32 %sliceidx) { +define void @ld1d_with_addr_offset( %pg, ptr %ptr, i64 %index, i32 %sliceidx) { ; CHECK-LABEL: ld1d_with_addr_offset: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w2 @@ -156,12 +156,12 @@ ; CHECK-NEXT: ret %base = getelementptr i64, ptr %ptr, i64 %index %tileslice = add i32 %sliceidx, 1 - call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %base, i64 0, i32 %tileslice) - call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %base, i64 7, i32 0) + call void @llvm.aarch64.sme.ld1d.horiz( %pg, ptr %base, i64 0, i32 %tileslice) + call void @llvm.aarch64.sme.ld1d.vert( %pg, ptr %base, i64 7, i32 0) ret void; } -define void @ld1q( %pg, ptr %ptr) { +define void @ld1q( %pg, ptr %ptr) { ; CHECK-LABEL: ld1q: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, wzr @@ -198,42 +198,42 @@ ; CHECK-NEXT: ld1q {za14v.q[w12, 0]}, p0/z, [x0] ; CHECK-NEXT: ld1q {za15v.q[w12, 0]}, p0/z, [x0] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 2, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 3, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 4, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 5, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 6, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 7, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 8, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 9, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 10, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 11, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 12, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 13, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 14, i32 0) - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 15, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 2, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 3, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 4, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 5, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 6, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 7, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 8, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 9, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 10, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 11, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 12, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 13, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 14, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 15, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 2, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 3, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 4, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 5, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 6, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 7, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 8, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 9, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 10, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 11, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 12, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 13, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 14, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %ptr, i64 15, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 2, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 3, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 4, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 5, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 6, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 7, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 8, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 9, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 10, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 11, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 12, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 13, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 14, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %ptr, i64 15, i32 0) ret void; } -define void @ld1q_with_addr_offset( %pg, ptr %ptr, i64 %index) { +define void @ld1q_with_addr_offset( %pg, ptr %ptr, i64 %index) { ; CHECK-LABEL: ld1q_with_addr_offset: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, wzr @@ -241,8 +241,8 @@ ; CHECK-NEXT: ld1q {za15v.q[w12, 0]}, p0/z, [x0, x1, lsl #4] ; CHECK-NEXT: ret %base = getelementptr i128, ptr %ptr, i64 %index - call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %base, i64 0, i32 0) - call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %base, i64 15, i32 0) + call void @llvm.aarch64.sme.ld1q.horiz( %pg, ptr %base, i64 0, i32 0) + call void @llvm.aarch64.sme.ld1q.vert( %pg, ptr %base, i64 15, i32 0) ret void; } @@ -297,7 +297,7 @@ ; Ensure that the tile offset is sunk, given that this is likely to be an 'add' ; that's decomposed into a base + offset in ISel. -define void @test_ld1_sink_tile0_offset_operand( %pg, ptr %src, i32 %base, i32 %N) { +define void @test_ld1_sink_tile0_offset_operand( %pg, ptr %src, i32 %base, i32 %N) { ; CHECK-LABEL: test_ld1_sink_tile0_offset_operand: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov w12, w1 @@ -317,9 +317,9 @@ for.body: %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] - call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %src, i64 0, i32 %base) - call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %src, i64 0, i32 %add1) - call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %src, i64 0, i32 %add2) + call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %src, i64 0, i32 %base) + call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %src, i64 0, i32 %add1) + call void @llvm.aarch64.sme.ld1w.horiz( %pg, ptr %src, i64 0, i32 %add2) %inc = add nuw nsw i32 %i, 1 %exitcond.not = icmp eq i32 %inc, %N br i1 %exitcond.not, label %exit, label %for.body @@ -330,15 +330,15 @@ declare void @llvm.aarch64.sme.ld1b.horiz(, ptr, i64, i32) -declare void @llvm.aarch64.sme.ld1h.horiz(, ptr, i64, i32) -declare void @llvm.aarch64.sme.ld1w.horiz(, ptr, i64, i32) -declare void @llvm.aarch64.sme.ld1d.horiz(, ptr, i64, i32) -declare void @llvm.aarch64.sme.ld1q.horiz(, ptr, i64, i32) +declare void @llvm.aarch64.sme.ld1h.horiz(, ptr, i64, i32) +declare void @llvm.aarch64.sme.ld1w.horiz(, ptr, i64, i32) +declare void @llvm.aarch64.sme.ld1d.horiz(, ptr, i64, i32) +declare void @llvm.aarch64.sme.ld1q.horiz(, ptr, i64, i32) declare void @llvm.aarch64.sme.ld1b.vert(, ptr, i64, i32) -declare void @llvm.aarch64.sme.ld1h.vert(, ptr, i64, i32) -declare void @llvm.aarch64.sme.ld1w.vert(, ptr, i64, i32) -declare void @llvm.aarch64.sme.ld1d.vert(, ptr, i64, i32) -declare void @llvm.aarch64.sme.ld1q.vert(, ptr, i64, i32) +declare void @llvm.aarch64.sme.ld1h.vert(, ptr, i64, i32) +declare void @llvm.aarch64.sme.ld1w.vert(, ptr, i64, i32) +declare void @llvm.aarch64.sme.ld1d.vert(, ptr, i64, i32) +declare void @llvm.aarch64.sme.ld1q.vert(, ptr, i64, i32) declare void @llvm.aarch64.sme.ldr(i32, ptr) declare i64 @llvm.vscale.i64() diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll @@ -30,7 +30,7 @@ ret void; } -define void @st1h( %pg, ptr %ptr, i32 %sliceidx) { +define void @st1h( %pg, ptr %ptr, i32 %sliceidx) { ; CHECK-LABEL: st1h: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w1 @@ -41,14 +41,14 @@ ; CHECK-NEXT: st1h {za1v.h[w12, 7]}, p0, [x0] ; CHECK-NEXT: ret %tileslice = add i32 %sliceidx, 7 - call void @llvm.aarch64.sme.st1h.horiz( %pg, ptr %ptr, i64 0, i32 %tileslice) - call void @llvm.aarch64.sme.st1h.horiz( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.st1h.vert( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.st1h.vert( %pg, ptr %ptr, i64 1, i32 %tileslice) + call void @llvm.aarch64.sme.st1h.horiz( %pg, ptr %ptr, i64 0, i32 %tileslice) + call void @llvm.aarch64.sme.st1h.horiz( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.st1h.vert( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.st1h.vert( %pg, ptr %ptr, i64 1, i32 %tileslice) ret void; } -define void @st1h_with_addr_offset( %pg, ptr %ptr, i64 %index, i32 %sliceidx) { +define void @st1h_with_addr_offset( %pg, ptr %ptr, i64 %index, i32 %sliceidx) { ; CHECK-LABEL: st1h_with_addr_offset: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w2 @@ -58,12 +58,12 @@ ; CHECK-NEXT: ret %base = getelementptr i16, ptr %ptr, i64 %index %tileslice = add i32 %sliceidx, 7 - call void @llvm.aarch64.sme.st1h.horiz( %pg, ptr %base, i64 0, i32 %tileslice) - call void @llvm.aarch64.sme.st1h.vert( %pg, ptr %base, i64 1, i32 0) + call void @llvm.aarch64.sme.st1h.horiz( %pg, ptr %base, i64 0, i32 %tileslice) + call void @llvm.aarch64.sme.st1h.vert( %pg, ptr %base, i64 1, i32 0) ret void; } -define void @st1w( %pg, ptr %ptr, i32 %sliceidx) { +define void @st1w( %pg, ptr %ptr, i32 %sliceidx) { ; CHECK-LABEL: st1w: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w13, wzr @@ -78,18 +78,18 @@ ; CHECK-NEXT: st1w {za3v.s[w13, 0]}, p0, [x0] ; CHECK-NEXT: ret %tileslice = add i32 %sliceidx, 3 - call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %ptr, i64 2, i32 0) - call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %ptr, i64 3, i32 %tileslice) - call void @llvm.aarch64.sme.st1w.vert( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.st1w.vert( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.st1w.vert( %pg, ptr %ptr, i64 2, i32 %tileslice) - call void @llvm.aarch64.sme.st1w.vert( %pg, ptr %ptr, i64 3, i32 0) + call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %ptr, i64 2, i32 0) + call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %ptr, i64 3, i32 %tileslice) + call void @llvm.aarch64.sme.st1w.vert( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.st1w.vert( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.st1w.vert( %pg, ptr %ptr, i64 2, i32 %tileslice) + call void @llvm.aarch64.sme.st1w.vert( %pg, ptr %ptr, i64 3, i32 0) ret void; } -define void @st1w_with_addr_offset( %pg, ptr %ptr, i64 %index, i32 %sliceidx) { +define void @st1w_with_addr_offset( %pg, ptr %ptr, i64 %index, i32 %sliceidx) { ; CHECK-LABEL: st1w_with_addr_offset: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, wzr @@ -99,12 +99,12 @@ ; CHECK-NEXT: ret %base = getelementptr i32, ptr %ptr, i64 %index %tileslice = add i32 %sliceidx, 3 - call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %base, i64 0, i32 0) - call void @llvm.aarch64.sme.st1w.vert( %pg, ptr %base, i64 3, i32 %tileslice) + call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %base, i64 0, i32 0) + call void @llvm.aarch64.sme.st1w.vert( %pg, ptr %base, i64 3, i32 %tileslice) ret void; } -define void @st1d( %pg, ptr %ptr, i32 %sliceidx) { +define void @st1d( %pg, ptr %ptr, i32 %sliceidx) { ; CHECK-LABEL: st1d: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w13, wzr @@ -127,26 +127,26 @@ ; CHECK-NEXT: st1d {za7v.d[w12, 1]}, p0, [x0] ; CHECK-NEXT: ret %tileslice = add i32 %sliceidx, 1 - call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 2, i32 0) - call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 3, i32 0) - call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 4, i32 %tileslice) - call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 5, i32 0) - call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 6, i32 0) - call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 7, i32 0) - call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 2, i32 0) - call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 3, i32 0) - call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 4, i32 0) - call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 5, i32 0) - call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 6, i32 0) - call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 7, i32 %tileslice) + call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 2, i32 0) + call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 3, i32 0) + call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 4, i32 %tileslice) + call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 5, i32 0) + call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 6, i32 0) + call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %ptr, i64 7, i32 0) + call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 2, i32 0) + call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 3, i32 0) + call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 4, i32 0) + call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 5, i32 0) + call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 6, i32 0) + call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %ptr, i64 7, i32 %tileslice) ret void; } -define void @st1d_with_addr_offset( %pg, ptr %ptr, i64 %index, i32 %sliceidx) { +define void @st1d_with_addr_offset( %pg, ptr %ptr, i64 %index, i32 %sliceidx) { ; CHECK-LABEL: st1d_with_addr_offset: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w2 @@ -156,12 +156,12 @@ ; CHECK-NEXT: ret %base = getelementptr i64, ptr %ptr, i64 %index %tileslice = add i32 %sliceidx, 1 - call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %base, i64 0, i32 %tileslice) - call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %base, i64 7, i32 0) + call void @llvm.aarch64.sme.st1d.horiz( %pg, ptr %base, i64 0, i32 %tileslice) + call void @llvm.aarch64.sme.st1d.vert( %pg, ptr %base, i64 7, i32 0) ret void; } -define void @st1q( %pg, ptr %ptr) { +define void @st1q( %pg, ptr %ptr) { ; CHECK-LABEL: st1q: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, wzr @@ -198,42 +198,42 @@ ; CHECK-NEXT: st1q {za14v.q[w12, 0]}, p0, [x0] ; CHECK-NEXT: st1q {za15v.q[w12, 0]}, p0, [x0] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 2, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 3, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 4, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 5, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 6, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 7, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 8, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 9, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 10, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 11, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 12, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 13, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 14, i32 0) - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 15, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 0, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 1, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 2, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 3, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 4, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 5, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 6, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 7, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 8, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 9, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 10, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 11, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 12, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 13, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 14, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 15, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 2, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 3, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 4, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 5, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 6, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 7, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 8, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 9, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 10, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 11, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 12, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 13, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 14, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %ptr, i64 15, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 0, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 1, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 2, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 3, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 4, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 5, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 6, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 7, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 8, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 9, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 10, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 11, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 12, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 13, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 14, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %ptr, i64 15, i32 0) ret void; } -define void @st1q_with_addr_offset( %pg, ptr %ptr, i64 %index) { +define void @st1q_with_addr_offset( %pg, ptr %ptr, i64 %index) { ; CHECK-LABEL: st1q_with_addr_offset: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, wzr @@ -241,8 +241,8 @@ ; CHECK-NEXT: st1q {za15v.q[w12, 0]}, p0, [x0, x1, lsl #4] ; CHECK-NEXT: ret %base = getelementptr i128, ptr %ptr, i64 %index - call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %base, i64 0, i32 0) - call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %base, i64 15, i32 0) + call void @llvm.aarch64.sme.st1q.horiz( %pg, ptr %base, i64 0, i32 0) + call void @llvm.aarch64.sme.st1q.vert( %pg, ptr %base, i64 15, i32 0) ret void; } @@ -297,7 +297,7 @@ ; Ensure that the tile offset is sunk, given that this is likely to be an 'add' ; that's decomposed into a base + offset in ISel. -define void @test_sink_tile0_offset_operand( %pg, ptr %src, i32 %base, i32 %N) { +define void @test_sink_tile0_offset_operand( %pg, ptr %src, i32 %base, i32 %N) { ; CHECK-LABEL: test_sink_tile0_offset_operand: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov w12, w1 @@ -317,9 +317,9 @@ for.body: %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] - tail call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %src, i64 0, i32 %base) - tail call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %src, i64 0, i32 %add0) - tail call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %src, i64 0, i32 %add1) + tail call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %src, i64 0, i32 %base) + tail call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %src, i64 0, i32 %add0) + tail call void @llvm.aarch64.sme.st1w.horiz( %pg, ptr %src, i64 0, i32 %add1) %inc = add nuw nsw i32 %i, 1 %exitcond.not = icmp eq i32 %inc, %N br i1 %exitcond.not, label %exit, label %for.body @@ -329,15 +329,15 @@ } declare void @llvm.aarch64.sme.st1b.horiz(, ptr, i64, i32) -declare void @llvm.aarch64.sme.st1h.horiz(, ptr, i64, i32) -declare void @llvm.aarch64.sme.st1w.horiz(, ptr, i64, i32) -declare void @llvm.aarch64.sme.st1d.horiz(, ptr, i64, i32) -declare void @llvm.aarch64.sme.st1q.horiz(, ptr, i64, i32) +declare void @llvm.aarch64.sme.st1h.horiz(, ptr, i64, i32) +declare void @llvm.aarch64.sme.st1w.horiz(, ptr, i64, i32) +declare void @llvm.aarch64.sme.st1d.horiz(, ptr, i64, i32) +declare void @llvm.aarch64.sme.st1q.horiz(, ptr, i64, i32) declare void @llvm.aarch64.sme.st1b.vert(, ptr, i64, i32) -declare void @llvm.aarch64.sme.st1h.vert(, ptr, i64, i32) -declare void @llvm.aarch64.sme.st1w.vert(, ptr, i64, i32) -declare void @llvm.aarch64.sme.st1d.vert(, ptr, i64, i32) -declare void @llvm.aarch64.sme.st1q.vert(, ptr, i64, i32) +declare void @llvm.aarch64.sme.st1h.vert(, ptr, i64, i32) +declare void @llvm.aarch64.sme.st1w.vert(, ptr, i64, i32) +declare void @llvm.aarch64.sme.st1d.vert(, ptr, i64, i32) +declare void @llvm.aarch64.sme.st1q.vert(, ptr, i64, i32) declare void @llvm.aarch64.sme.str(i32, ptr) declare i64 @llvm.vscale.i64()