Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2270,6 +2270,19 @@
   case AArch64::LD1SW_D_IMM:
   case AArch64::LD1D_IMM:
+  case AArch64::LD2B_IMM:
+  case AArch64::LD2H_IMM:
+  case AArch64::LD2W_IMM:
+  case AArch64::LD2D_IMM:
+  case AArch64::LD3B_IMM:
+  case AArch64::LD3H_IMM:
+  case AArch64::LD3W_IMM:
+  case AArch64::LD3D_IMM:
+  case AArch64::LD4B_IMM:
+  case AArch64::LD4H_IMM:
+  case AArch64::LD4W_IMM:
+  case AArch64::LD4D_IMM:
+
   case AArch64::ST1B_IMM:
   case AArch64::ST1B_H_IMM:
   case AArch64::ST1B_S_IMM:
@@ -2281,6 +2294,19 @@
   case AArch64::ST1W_D_IMM:
   case AArch64::ST1D_IMM:
+  case AArch64::ST2B_IMM:
+  case AArch64::ST2H_IMM:
+  case AArch64::ST2W_IMM:
+  case AArch64::ST2D_IMM:
+  case AArch64::ST3B_IMM:
+  case AArch64::ST3H_IMM:
+  case AArch64::ST3W_IMM:
+  case AArch64::ST3D_IMM:
+  case AArch64::ST4B_IMM:
+  case AArch64::ST4H_IMM:
+  case AArch64::ST4W_IMM:
+  case AArch64::ST4D_IMM:
+
   case AArch64::LD1RB_IMM:
   case AArch64::LD1RB_H_IMM:
   case AArch64::LD1RB_S_IMM:
@@ -2897,6 +2923,49 @@
     MinOffset = -8;
     MaxOffset = 7;
     break;
+
+  case AArch64::LD2B_IMM:
+  case AArch64::LD2H_IMM:
+  case AArch64::LD2W_IMM:
+  case AArch64::LD2D_IMM:
+  case AArch64::ST2B_IMM:
+  case AArch64::ST2H_IMM:
+  case AArch64::ST2W_IMM:
+  case AArch64::ST2D_IMM:
+    Scale = TypeSize::Scalable(32);
+    Width = SVEMaxBytesPerVector * 2;
+    MinOffset = -16;
+    MaxOffset = 14;
+    break;
+
+  case AArch64::LD3B_IMM:
+  case AArch64::LD3H_IMM:
+  case AArch64::LD3W_IMM:
+  case AArch64::LD3D_IMM:
+  case AArch64::ST3B_IMM:
+  case AArch64::ST3H_IMM:
+  case AArch64::ST3W_IMM:
+  case AArch64::ST3D_IMM:
+    Scale = TypeSize::Scalable(48);
+    Width = SVEMaxBytesPerVector * 3;
+    MinOffset = -24;
+    MaxOffset = 21;
+    break;
+
+  case AArch64::LD4B_IMM:
+  case AArch64::LD4H_IMM:
+  case AArch64::LD4W_IMM:
+  case AArch64::LD4D_IMM:
+  case AArch64::ST4B_IMM:
+  case AArch64::ST4H_IMM:
+  case AArch64::ST4W_IMM:
+  case AArch64::ST4D_IMM:
+    Scale = TypeSize::Scalable(64);
+    Width = SVEMaxBytesPerVector * 4;
+    MinOffset = -32;
+    MaxOffset = 28;
+    break;
+
   case AArch64::LD1B_H_IMM:
   case AArch64::LD1SB_H_IMM:
   case AArch64::LD1H_S_IMM:
Index: llvm/test/CodeGen/AArch64/ldN-reg-imm-alloca.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/ldN-reg-imm-alloca.ll
@@ -0,0 +1,339 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; LD2
+
+define <vscale x 32 x i8> @ld2b_imm(<vscale x 16 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld2b_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    strb wzr, [sp, #-64]!
+; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [sp]
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %alloc = alloca [64 x i8], i32 1, align 16
+  %ptr = bitcast [64 x i8]* %alloc to i8*
+  store i8 zeroinitializer, i8* %ptr, align 16
+  %ld2 = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8(<vscale x 16 x i1> %pg, i8* %ptr)
+  ret <vscale x 32 x i8> %ld2
+}
+
+define <vscale x 16 x i16> @ld2h_imm(<vscale x 8 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld2h_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    strh wzr, [sp, #-64]!
+; CHECK-NEXT:    ld2h { z0.h, z1.h }, p0/z, [sp]
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %alloc = alloca [64 x i8], i32 1, align 16
+  %ptr = bitcast [64 x i8]* %alloc to i16*
+  store i16 zeroinitializer, i16* %ptr, align 16
+  %ld2 = call <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16(<vscale x 8 x i1> %pg, i16* %ptr)
+  ret <vscale x 16 x i16> %ld2
+}
+
+define <vscale x 8 x i32> @ld2s_imm(<vscale x 4 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld2s_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    str wzr, [sp, #-64]!
+; CHECK-NEXT:    ld2w { z0.s, z1.s }, p0/z, [sp]
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %alloc = alloca [32 x i16], i32 1, align 16
+  %ptr = bitcast [32 x i16]* %alloc to i32*
+  store i32 zeroinitializer, i32* %ptr, align 16
+  %ld2 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32(<vscale x 4 x i1> %pg, i32* %ptr)
+  ret <vscale x 8 x i32> %ld2
+}
+
+define <vscale x 4 x i64> @ld2d_imm(<vscale x 2 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld2d_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    str xzr, [sp, #-64]!
+; CHECK-NEXT:    ld2d { z0.d, z1.d }, p0/z, [sp]
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %alloc = alloca [8 x i64], i32 1, align 16
+  %ptr = bitcast [8 x i64]* %alloc to i64*
+  store i64 zeroinitializer, i64* %ptr, align 16
+  %ld2 = call <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64(<vscale x 2 x i1> %pg, i64* %ptr)
+  ret <vscale x 4 x i64> %ld2
+}
+
+define <vscale x 32 x i8> @ld2b_nonzero_imm(<vscale x 16 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld2b_nonzero_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #192
+; CHECK-NEXT:    .cfi_def_cfa_offset 192
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    mov z0.b, #0 // =0x0
+; CHECK-NEXT:    st1b { z0.b }, p1, [x8, #1, mul vl]
+; CHECK-NEXT:    st1b { z0.b }, p1, [x8]
+; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x8, #4, mul vl]
+; CHECK-NEXT:    add sp, sp, #192
+; CHECK-NEXT:    ret
+  %alloc = alloca [64 x i8], i32 3
+  %bc = bitcast [64 x i8]* %alloc to <vscale x 32 x i8>*
+  store <vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8>* %bc, align 16
+  %gep = getelementptr inbounds <vscale x 32 x i8>, <vscale x 32 x i8>* %bc, i64 2, i64 0
+  %ld2 = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8(<vscale x 16 x i1> %pg, i8* %gep)
+  ret <vscale x 32 x i8> %ld2
+}
+
+; Test where the stack offset is non-zero due to an extra alloca
+define <vscale x 16 x half> @ld2b_f16_valid_imm(<vscale x 8 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld2b_f16_valid_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-5
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x8, sp, #1
+; CHECK-NEXT:    ld2h { z0.h, z1.h }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #5
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %alloca1 = alloca <vscale x 8 x half>, i32 4
+  %alloca2 = alloca <vscale x 8 x half>, i32 1
+  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %alloca1, i64 2, i64 0
+  %ld2 = call <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16(<vscale x 8 x i1> %pg, half* %base)
+  ret <vscale x 16 x half> %ld2
+}
+
+; LD3
+
+define <vscale x 48 x i8> @ld3b_imm(<vscale x 16 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld3b_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    strb wzr, [sp, #-96]!
+; CHECK-NEXT:    ld3b { z0.b, z1.b, z2.b }, p0/z, [sp]
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+  %alloc = alloca [96 x i8], i32 1, align 16
+  %ptr = bitcast [96 x i8]* %alloc to i8*
+  store i8 zeroinitializer, i8* %ptr, align 16
+  %ld3 = call <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8(<vscale x 16 x i1> %pg, i8* %ptr)
+  ret <vscale x 48 x i8> %ld3
+}
+
+define <vscale x 24 x i16> @ld3h_imm(<vscale x 8 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld3h_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    strh wzr, [sp, #-96]!
+; CHECK-NEXT:    ld3h { z0.h, z1.h, z2.h }, p0/z, [sp]
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+  %alloc = alloca [48 x i16], i32 1, align 16
+  %ptr = bitcast [48 x i16]* %alloc to i16*
+  store i16 zeroinitializer, i16* %ptr, align 16
+  %ld3 = call <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16(<vscale x 8 x i1> %pg, i16* %ptr)
+  ret <vscale x 24 x i16> %ld3
+}
+
+define <vscale x 12 x i32> @ld3s_imm(<vscale x 4 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld3s_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    str wzr, [sp, #-96]!
+; CHECK-NEXT:    ld3w { z0.s, z1.s, z2.s }, p0/z, [sp]
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+  %alloc = alloca [24 x i32], i32 1, align 16
+  %ptr = bitcast [24 x i32]* %alloc to i32*
+  store i32 zeroinitializer, i32* %ptr, align 16
+  %ld3 = call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(<vscale x 4 x i1> %pg, i32* %ptr)
+  ret <vscale x 12 x i32> %ld3
+}
+
+define <vscale x 6 x i64> @ld3d_imm(<vscale x 2 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld3d_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    str xzr, [sp, #-96]!
+; CHECK-NEXT:    ld3d { z0.d, z1.d, z2.d }, p0/z, [sp]
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+  %alloc = alloca [12 x i64], i32 1, align 16
+  %ptr = bitcast [12 x i64]* %alloc to i64*
+  store i64 zeroinitializer, i64* %ptr, align 16
+  %ld3 = call <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64(<vscale x 2 x i1> %pg, i64* %ptr)
+  ret <vscale x 6 x i64> %ld3
+}
+
+define <vscale x 24 x i16> @ld3h_nonzero_imm(<vscale x 8 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld3h_nonzero_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #400
+; CHECK-NEXT:    str x29, [sp, #384] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 400
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    st1h { z0.h }, p1, [x8, #2, mul vl]
+; CHECK-NEXT:    st1h { z0.h }, p1, [x8, #1, mul vl]
+; CHECK-NEXT:    st1h { z0.h }, p1, [x8]
+; CHECK-NEXT:    ld3h { z0.h, z1.h, z2.h }, p0/z, [x8, #12, mul vl]
+; CHECK-NEXT:    ldr x29, [sp, #384] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #400
+; CHECK-NEXT:    ret
+  %alloc = alloca [48 x i16], i32 4
+  %bc = bitcast [48 x i16]* %alloc to <vscale x 24 x i16>*
+  store <vscale x 24 x i16> zeroinitializer, <vscale x 24 x i16>* %bc, align 16
+  %gep = getelementptr inbounds <vscale x 24 x i16>, <vscale x 24 x i16>* %bc, i64 3, i64 0
+  %ld3 = call <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16(<vscale x 8 x i1> %pg, i16* %gep)
+  ret <vscale x 24 x i16> %ld3
+}
+
+; Test where the stack offset is non-zero due to an extra alloca
+define <vscale x 12 x float> @ld3b_f32_valid_imm(<vscale x 4 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld3b_f32_valid_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-13
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 104 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x8, sp, #1
+; CHECK-NEXT:    ld3w { z0.s, z1.s, z2.s }, p0/z, [x8, #9, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #13
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %alloca1 = alloca <vscale x 4 x float>, i32 12
+  %alloca2 = alloca <vscale x 4 x float>, i32 1
+  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %alloca1, i64 9, i64 0
+  %ld3 = call <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32(<vscale x 4 x i1> %pg, float* %base)
+  ret <vscale x 12 x float> %ld3
+}
+
+; LD4
+
+define <vscale x 64 x i8> @ld4b_imm(<vscale x 16 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld4b_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    strb wzr, [sp, #-128]!
+; CHECK-NEXT:    ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [sp]
+; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    ret
+  %alloc = alloca [128 x i8], i32 1, align 16
+  %ptr = bitcast [128 x i8]* %alloc to i8*
+  store i8 zeroinitializer, i8* %ptr, align 16
+  %ld4 = call <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8(<vscale x 16 x i1> %pg, i8* %ptr)
+  ret <vscale x 64 x i8> %ld4
+}
+
+define <vscale x 32 x i16> @ld4h_imm(<vscale x 8 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld4h_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    strh wzr, [sp, #-128]!
+; CHECK-NEXT:    ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [sp]
+; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    ret
+  %alloc = alloca [64 x i16], i32 1, align 16
+  %ptr = bitcast [64 x i16]* %alloc to i16*
+  store i16 zeroinitializer, i16* %ptr, align 16
+  %ld4 = call <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16(<vscale x 8 x i1> %pg, i16* %ptr)
+  ret <vscale x 32 x i16> %ld4
+}
+
+define <vscale x 16 x i32> @ld4s_imm(<vscale x 4 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld4s_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    str wzr, [sp, #-128]!
+; CHECK-NEXT:    ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [sp]
+; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    ret
+  %alloc = alloca [32 x i32], i32 1, align 16
+  %ptr = bitcast [32 x i32]* %alloc to i32*
+  store i32 zeroinitializer, i32* %ptr, align 16
+  %ld4 = call <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32(<vscale x 4 x i1> %pg, i32* %ptr)
+  ret <vscale x 16 x i32> %ld4
+}
+
+define <vscale x 8 x i64> @ld4d_imm(<vscale x 2 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld4d_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    str xzr, [sp, #-128]!
+; CHECK-NEXT:    ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [sp]
+; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    ret
+  %alloc = alloca [16 x i64], i32 1, align 16
+  %ptr = bitcast [16 x i64]* %alloc to i64*
+  store i64 zeroinitializer, i64* %ptr, align 16
+  %ld4 = call <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64(<vscale x 2 x i1> %pg, i64* %ptr)
+  ret <vscale x 8 x i64> %ld4
+}
+
+define <vscale x 16 x i32> @ld4s_nonzero_imm(<vscale x 4 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld4s_nonzero_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #272
+; CHECK-NEXT:    str x29, [sp, #256] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 272
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    st1w { z0.s }, p1, [x8, #3, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p1, [x8, #2, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p1, [x8, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p1, [x8]
+; CHECK-NEXT:    ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x8, #4, mul vl]
+; CHECK-NEXT:    ldr x29, [sp, #256] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #272
+; CHECK-NEXT:    ret
+  %alloc = alloca [32 x i32], i32 2
+  %bc = bitcast [32 x i32]* %alloc to <vscale x 16 x i32>*
+  store <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32>* %bc, align 16
+  %gep = getelementptr inbounds <vscale x 16 x i32>, <vscale x 16 x i32>* %bc, i64 1, i64 0
+  %ld4 = call <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32(<vscale x 4 x i1> %pg, i32* %gep)
+  ret <vscale x 16 x i32> %ld4
+}
+
+; Test where the stack offset is non-zero due to an extra alloca
+define <vscale x 8 x double> @ld4b_f64_valid_imm(<vscale x 2 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: ld4b_f64_valid_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-15
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xf8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 120 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x8, sp, #2
+; CHECK-NEXT:    addvl x8, x8, #9
+; CHECK-NEXT:    ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #15
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %alloca1 = alloca <vscale x 2 x double>, i32 13
+  %alloca2 = alloca <vscale x 2 x double>, i32 2
+  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %alloca1, i64 9, i64 0
+  %ld4 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64(<vscale x 2 x i1> %pg, double* %base)
+  ret <vscale x 8 x double> %ld4
+}
+
+declare <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64(<vscale x 2 x i1>, i64*)
+
+declare <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32(<vscale x 4 x i1>, float*)
+declare <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64(<vscale x 2 x i1>, i64*)
+
+declare <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64(<vscale x 2 x i1>, double*)
Index: llvm/test/CodeGen/AArch64/stN-reg-imm-alloca.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/stN-reg-imm-alloca.ll
@@ -0,0 +1,375 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; ST2
+
+define void @st2b_imm(<vscale x 16 x i8> %in, <vscale x 16 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st2b_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    st2b { z0.b, z1.b }, p0, [sp]
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %alloc = alloca [64 x i8], i32 1, align 16
+  %ptr = bitcast [64 x i8]* %alloc to i8*
+  call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> %in, <vscale x 16 x i8> %in, <vscale x 16 x i1> %pg, i8* %ptr)
+  ret void
+}
+
+define void @st2h_imm(<vscale x 8 x i16> %in, <vscale x 8 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st2h_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    st2h { z0.h, z1.h }, p0, [sp]
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %alloc = alloca [32 x i16], i32 1, align 16
+  %ptr = bitcast [32 x i16]* %alloc to i16*
+  call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> %in, <vscale x 8 x i16> %in, <vscale x 8 x i1> %pg, i16* %ptr)
+  ret void
+}
+
+define void @st2w_imm(<vscale x 4 x i32> %in, <vscale x 4 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st2w_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    st2w { z0.s, z1.s }, p0, [sp]
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %alloc = alloca [16 x i32], i32 1, align 16
+  %ptr = bitcast [16 x i32]* %alloc to i32*
+  call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %in, <vscale x 4 x i32> %in, <vscale x 4 x i1> %pg, i32* %ptr)
+  ret void
+}
+
+define void @st2d_imm(<vscale x 2 x i64> %in, <vscale x 2 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st2d_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    st2d { z0.d, z1.d }, p0, [sp]
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %alloc = alloca [8 x i64], i32 1, align 16
+  %ptr = bitcast [8 x i64]* %alloc to i64*
+  call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> %in, <vscale x 2 x i64> %in, <vscale x 2 x i1> %pg, i64* %ptr)
+  ret void
+}
+
+define void @st2d_nonzero_imm(<vscale x 2 x i64> %in, <vscale x 2 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st2d_nonzero_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #272
+; CHECK-NEXT:    str x29, [sp, #256] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 272
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    st2d { z0.d, z1.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    ldr x29, [sp, #256] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #272
+; CHECK-NEXT:    ret
+  %alloc = alloca [8 x i64], i32 4
+  %bc = bitcast [8 x i64]* %alloc to <vscale x 2 x i64>*
+  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %bc, i64 2, i64 0
+  call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> %in, <vscale x 2 x i64> %in, <vscale x 2 x i1> %pg, i64* %base)
+  ret void
+}
+
+; Test where the stack offset is non-zero due to an extra alloca
+define void @st2b_f8_valid_imm(<vscale x 8 x half> %in, <vscale x 8 x i1> %pred) vscale_range(2,2) {
+; CHECK-LABEL: st2b_f8_valid_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-5
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z2.s, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    addvl x8, sp, #1
+; CHECK-NEXT:    st1w { z2.s }, p1, [sp]
+; CHECK-NEXT:    st2h { z0.h, z1.h }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #5
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %alloca1 = alloca <vscale x 8 x half>, i32 4
+  %alloca2 = alloca <vscale x 4 x i32>, i32 1
+  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %alloca1, i64 2, i64 0
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %alloca2
+  call void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half> %in, <vscale x 8 x half> %in, <vscale x 8 x i1> %pred, half* %base)
+  ret void
+}
+
+; ST3
+
+define void @st3b_imm(<vscale x 16 x i8> %in, <vscale x 16 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st3b_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #96
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1_z2
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    st3b { z0.b, z1.b, z2.b }, p0, [sp]
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+  %alloc = alloca [96 x i8], i32 1, align 16
+  %ptr = bitcast [96 x i8]* %alloc to i8*
+  call void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8> %in, <vscale x 16 x i8> %in, <vscale x 16 x i8> %in, <vscale x 16 x i1> %pg, i8* %ptr)
+  ret void
+}
+
+define void @st3h_imm(<vscale x 8 x i16> %in, <vscale x 8 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st3h_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #96
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1_z2
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    st3h { z0.h, z1.h, z2.h }, p0, [sp]
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+  %alloc = alloca [48 x i16], i32 1, align 16
+  %ptr = bitcast [48 x i16]* %alloc to i16*
+  call void @llvm.aarch64.sve.st3.nxv8i16(<vscale x 8 x i16> %in, <vscale x 8 x i16> %in, <vscale x 8 x i16> %in, <vscale x 8 x i1> %pg, i16* %ptr)
+  ret void
+}
+
+define void @st3w_imm(<vscale x 4 x i32> %in, <vscale x 4 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st3w_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #96
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1_z2
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    st3w { z0.s, z1.s, z2.s }, p0, [sp]
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+  %alloc = alloca [24 x i32], i32 1, align 16
+  %ptr = bitcast [24 x i32]* %alloc to i32*
+  call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> %in, <vscale x 4 x i32> %in, <vscale x 4 x i32> %in, <vscale x 4 x i1> %pg, i32* %ptr)
+  ret void
+}
+
+define void @st3d_imm(<vscale x 2 x i64> %in, <vscale x 2 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st3d_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #96
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1_z2
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    st3d { z0.d, z1.d, z2.d }, p0, [sp]
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+  %alloc = alloca [12 x i64], i32 1, align 16
+  %ptr = bitcast [12 x i64]* %alloc to i64*
+  call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> %in, <vscale x 2 x i64> %in, <vscale x 2 x i64> %in, <vscale x 2 x i1> %pg, i64* %ptr)
+  ret void
+}
+
+define void @st3d_nonzero_imm(<vscale x 4 x float> %in, <vscale x 4 x i1> %pg, <vscale x 4 x float>* %addr) vscale_range(2,2) {
+; CHECK-LABEL: st3d_nonzero_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #400
+; CHECK-NEXT:    str x29, [sp, #384] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 400
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1_z2
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    st3w { z0.s, z1.s, z2.s }, p0, [sp, #9, mul vl]
+; CHECK-NEXT:    ldr x29, [sp, #384] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #400
+; CHECK-NEXT:    ret
+  %alloc = alloca [24 x i32], i32 4
+  %bc = bitcast [24 x i32]* %alloc to <vscale x 4 x float>*
+  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %bc, i64 9, i64 0
+  call void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float> %in, <vscale x 4 x float> %in, <vscale x 4 x float> %in, <vscale x 4 x i1> %pg, float* %base)
+  ret void
+}
+
+; Test where the stack offset is non-zero due to an extra alloca
+define void @st3h_f32_valid_imm(<vscale x 4 x float> %in, <vscale x 4 x i1> %pred) vscale_range(2,2) {
+; CHECK-LABEL: st3h_f32_valid_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-11
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 88 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1_z2
+; CHECK-NEXT:    addvl x8, sp, #3
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    addvl x8, x8, #4
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.h, #0 // =0x0
+; CHECK-NEXT:    st1h { z3.h }, p1, [sp]
+; CHECK-NEXT:    st3w { z0.s, z1.s, z2.s }, p0, [x8]
+; CHECK-NEXT:    addvl sp, sp, #11
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %alloca1 = alloca <vscale x 4 x float>, i32 8
+  %alloca2 = alloca <vscale x 8 x i16>, i32 3
+  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %alloca1, i64 4, i64 0
+  store <vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16>* %alloca2
+  call void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float> %in, <vscale x 4 x float> %in, <vscale x 4 x float> %in, <vscale x 4 x i1> %pred, float* %base)
+  ret void
+}
+
+; ST4
+
+define void @st4b_imm(<vscale x 16 x i8> %in, <vscale x 16 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st4b_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #128
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    st4b { z0.b, z1.b, z2.b, z3.b }, p0, [sp]
+; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    ret
+  %alloc = alloca [128 x i8], i32 1, align 16
+  %ptr = bitcast [128 x i8]* %alloc to i8*
+  call void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8> %in, <vscale x 16 x i8> %in, <vscale x 16 x i8> %in, <vscale x 16 x i8> %in, <vscale x 16 x i1> %pg, i8* %ptr)
+  ret void
+}
+
+define void @st4h_imm(<vscale x 8 x i16> %in, <vscale x 8 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st4h_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #128
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    st4h { z0.h, z1.h, z2.h, z3.h }, p0, [sp]
+; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    ret
+  %alloc = alloca [64 x i16], i32 1, align 16
+  %ptr = bitcast [64 x i16]* %alloc to i16*
+  call void @llvm.aarch64.sve.st4.nxv8i16(<vscale x 8 x i16> %in, <vscale x 8 x i16> %in, <vscale x 8 x i16> %in, <vscale x 8 x i16> %in, <vscale x 8 x i1> %pg, i16* %ptr)
+  ret void
+}
+
+define void @st4w_imm(<vscale x 4 x i32> %in, <vscale x 4 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st4w_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #128
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    st4w { z0.s, z1.s, z2.s, z3.s }, p0, [sp]
+; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    ret
+  %alloc = alloca [32 x i32], i32 1, align 16
+  %ptr = bitcast [32 x i32]* %alloc to i32*
+  call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> %in, <vscale x 4 x i32> %in, <vscale x 4 x i32> %in, <vscale x 4 x i32> %in, <vscale x 4 x i1> %pg, i32* %ptr)
+  ret void
+}
+
+define void @st4d_imm(<vscale x 2 x i64> %in, <vscale x 2 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st4d_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #128
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    st4d { z0.d, z1.d, z2.d, z3.d }, p0, [sp]
+; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    ret
+  %alloc = alloca [16 x i64], i32 1, align 16
+  %ptr = bitcast [16 x i64]* %alloc to i64*
+  call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> %in, <vscale x 2 x i64> %in, <vscale x 2 x i64> %in, <vscale x 2 x i64> %in, <vscale x 2 x i1> %pg, i64* %ptr)
+  ret void
+}
+
+define void @st4w_nonzero_imm(<vscale x 4 x i32> %in, <vscale x 4 x i1> %pg) vscale_range(2,2) {
+; CHECK-LABEL: st4w_nonzero_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #272
+; CHECK-NEXT:    str x29, [sp, #256] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 272
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    st4w { z0.s, z1.s, z2.s, z3.s }, p0, [sp, #4, mul vl]
+; CHECK-NEXT:    ldr x29, [sp, #256] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #272
+; CHECK-NEXT:    ret
+  %alloc = alloca [32 x i32], i32 2
+  %bc = bitcast [32 x i32]* %alloc to <vscale x 4 x i32>*
+  %base = getelementptr inbounds <vscale x 4 x i32>, <vscale x 4 x i32>* %bc, i64 4, i64 0
+  call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> %in, <vscale x 4 x i32> %in, <vscale x 4 x i32> %in, <vscale x 4 x i32> %in, <vscale x 4 x i1> %pg, i32* %base)
+  ret void
+}
+
+; Test where the stack offset is non-zero due to an extra alloca
+define void @st4d_f64_valid_imm(<vscale x 2 x double> %in, <vscale x 2 x i1> %pred) vscale_range(2,2) {
+; CHECK-LABEL: st4d_f64_valid_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-9
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1_z2_z3
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mov z4.d, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    addvl x8, sp, #1
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    st1d { z4.d }, p1, [sp]
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x8, #4, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #9
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %alloca1 = alloca <vscale x 2 x double>, i32 8
+  %alloca2 = alloca <vscale x 2 x i64>, i32 1
+  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %alloca1, i64 4, i64 0
+  store <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64>* %alloca2
+  call void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double> %in, <vscale x 2 x double> %in, <vscale x 2 x double> %in, <vscale x 2 x double> %in, <vscale x 2 x i1> %pred, double* %base)
+  ret void
+}
+
+declare void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
+declare void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
+declare void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
+declare void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
+
+declare void @llvm.aarch64.sve.st3.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
+declare void @llvm.aarch64.sve.st3.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
+declare void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
+declare void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
+declare void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
+
+declare void @llvm.aarch64.sve.st4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
+declare void @llvm.aarch64.sve.st4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
+declare void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
+declare void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
+declare void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
Index: llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @st1d_fixed(<8 x double>* %ptr) #0 {
+; CHECK-LABEL: st1d_fixed:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    add x8, sp, #8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld2d { z0.d, z1.d }, p0/z, [x8]
+; CHECK-NEXT:    mov x8, #4
+; CHECK-NEXT:    mov z0.d, #0 // =0x0
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %alloc = alloca [16 x double], i32 0
+  %bc = bitcast [16 x double]* %alloc to <8 x double>*
+  %load = load <8 x double>, <8 x double>* %bc
+  %strided.vec = shufflevector <8 x double> %load, <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  store <8 x double> zeroinitializer, <8 x double>* %ptr
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" vscale_range(2,2) }