diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1324,6 +1324,7 @@
       setOperationAction(ISD::MGATHER, VT, Custom);
       setOperationAction(ISD::MSCATTER, VT, Custom);
       setOperationAction(ISD::MLOAD, VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
     }
 
     setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom);
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+bf16 < %s | FileCheck %s --check-prefixes=CHECK
 
 define <vscale x 2 x i64> @insert_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
 ; CHECK-LABEL: insert_v2i64_nxv2i64:
@@ -466,8 +466,73 @@
   ret %v2
 }
 
+define <vscale x 2 x bfloat> @insert_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind {
+; CHECK-LABEL: insert_nxv2bf16_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 2 x bfloat> @llvm.experimental.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 0)
+  ret <vscale x 2 x bfloat> %v0
+}
+
+define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind {
+; CHECK-LABEL: insert_nxv4bf16_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 0)
+  ret <vscale x 4 x bfloat> %v0
+}
+
+define <vscale x 4 x bfloat> @insert_nxv4bf16_v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1) nounwind {
+; CHECK-LABEL: insert_nxv4bf16_v4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    str d1, [x8]
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1, i64 0)
+  ret <vscale x 4 x bfloat> %v0
+}
+
+define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1) nounwind {
+; CHECK-LABEL: insert_nxv8bf16_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1, i64 0)
+  ret <vscale x 8 x bfloat> %v0
+}
+
+define <vscale x 8 x bfloat> @insert_nxv8bf16_v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1) nounwind {
+; CHECK-LABEL: insert_nxv8bf16_v8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    str q1, [sp]
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1, i64 0)
+  ret <vscale x 8 x bfloat> %v0
+}
+
 declare <vscale x 3 x i32> @llvm.experimental.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32>, <vscale x 2 x i32>, i64)
 declare <vscale x 3 x float> @llvm.experimental.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float>, <vscale x 2 x float>, i64)
 declare <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32>, <vscale x 2 x i32>, i64)
 declare <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32>, <vscale x 3 x i32>, i64)
 declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv12i32(<vscale x 4 x i32>, <vscale x 12 x i32>, i64)
+declare <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
+declare <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
+declare <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i64)
+declare <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat>, <4 x bfloat>, i64)
+declare <vscale x 2 x bfloat> @llvm.experimental.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i64)