diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11477,29 +11477,33 @@ if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) return SDValue(); - EVT WideVT; - SDValue ExtVec; + EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount()); + EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount()); + // NOP cast operands to the largest legal vector of the same element count. if (VT.isFloatingPoint()) { - // The InVT type should be legal. We can safely cast the unpacked - // subvector from InVT -> VT. - WideVT = VT; - ExtVec = getSVESafeBitCast(VT, Vec1, DAG); + Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG); + Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG); } else { - // Extend elements of smaller vector... - WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext())); - ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); + // Legal integer vectors are already their largest so Vec0 is fine as is. + Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); } + // To replace the top/bottom half of vector V with vector SubV we widen the + // preserved half of V, concatenate this to SubV (the order dependent on the + // half being replaced) and then narrow the result. 
+ SDValue Narrow; if (Idx == 0) { SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0); - return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0); - } else if (Idx == InVT.getVectorMinNumElements()) { + Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0); + } else { + assert(Idx == InVT.getVectorMinNumElements() && + "Invalid subvector index!"); SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0); - return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec); + Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1); } - return SDValue(); + return getSVESafeBitCast(VT, Narrow, DAG); } if (Idx == 0 && isPackedVectorType(VT, DAG)) { diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll @@ -299,6 +299,26 @@ ret %r } +define @insert_nxv4f16_nxv2f16_0( %sv0, %sv1) nounwind { +; CHECK-LABEL: insert_nxv4f16_nxv2f16_0: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ret + %v0 = call @llvm.experimental.vector.insert.nxv4f16.nxv2f16( %sv0, %sv1, i64 0) + ret %v0 +} + +define @insert_nxv4f16_nxv2f16_2( %sv0, %sv1) nounwind { +; CHECK-LABEL: insert_nxv4f16_nxv2f16_2: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %v0 = call @llvm.experimental.vector.insert.nxv4f16.nxv2f16( %sv0, %sv1, i64 2) + ret %v0 +} + ; Test that the index is scaled by vscale if the subvector is scalable. 
define @insert_nxv8f16_nxv2f16( %vec, %in) nounwind { ; CHECK-LABEL: insert_nxv8f16_nxv2f16: @@ -317,6 +337,26 @@ ret %r } +define @insert_nxv8f16_nxv4f16_0( %sv0, %sv1) nounwind { +; CHECK-LABEL: insert_nxv8f16_nxv4f16_0: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: ret + %v0 = call @llvm.experimental.vector.insert.nxv8f16.nxv4f16( %sv0, %sv1, i64 0) + ret %v0 +} + +define @insert_nxv8f16_nxv4f16_4( %sv0, %sv1) nounwind { +; CHECK-LABEL: insert_nxv8f16_nxv4f16_4: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %v0 = call @llvm.experimental.vector.insert.nxv8f16.nxv4f16( %sv0, %sv1, i64 4) + ret %v0 +} + ; Fixed length clamping define @insert_fixed_v2i64_nxv2i64( %vec, <2 x i64> %subvec) nounwind #0 { @@ -367,24 +407,6 @@ ret %retval } -attributes #0 = { vscale_range(2,2) } - -declare @llvm.experimental.vector.insert.nxv2i64.v2i64(, <2 x i64>, i64) -declare @llvm.experimental.vector.insert.nxv4i32.v4i32(, <4 x i32>, i64) -declare @llvm.experimental.vector.insert.nxv8i16.v8i16(, <8 x i16>, i64) -declare @llvm.experimental.vector.insert.nxv16i8.v16i8(, <16 x i8>, i64) - -declare @llvm.experimental.vector.insert.nxv2i64.v4i64(, <4 x i64>, i64) - -declare @llvm.experimental.vector.insert.nxv8i64.nxv16i64(, , i64) -declare @llvm.experimental.vector.insert.v2i64.nxv16i64(, <2 x i64>, i64) -declare @llvm.experimental.vector.insert.nxv4i32.nxv1i32(, , i64) -declare @llvm.experimental.vector.insert.nxv6i16.nxv1i16(, , i64) - -declare @llvm.experimental.vector.insert.nxv8i16.nxv2i16(, , i64) - -declare @llvm.experimental.vector.insert.nxv8f16.nxv2f16(, , i64) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Upacked types that need result widening ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -418,6 +440,26 @@ ret %v0 } +define @insert_nxv4f32_nxv2f32_0( %sv0, %sv1) nounwind { +; CHECK-LABEL: insert_nxv4f32_nxv2f32_0: +; CHECK: // %bb.0: +; 
CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ret + %v0 = call @llvm.experimental.vector.insert.nxv4f32.nxv2f32( %sv0, %sv1, i64 0) + ret %v0 +} + +define @insert_nxv4f32_nxv2f32_2( %sv0, %sv1) nounwind { +; CHECK-LABEL: insert_nxv4f32_nxv2f32_2: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %v0 = call @llvm.experimental.vector.insert.nxv4f32.nxv2f32( %sv0, %sv1, i64 2) + ret %v0 +} + define @insert_nxv6i32_nxv2i32( %sv0, %sv1) nounwind { ; CHECK-LABEL: insert_nxv6i32_nxv2i32: ; CHECK: // %bb.0: @@ -509,6 +551,46 @@ ret %v0 } +define @insert_nxv8bf16_nxv4bf16_0( %sv0, %sv1) nounwind { +; CHECK-LABEL: insert_nxv8bf16_nxv4bf16_0: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: ret + %v0 = call @llvm.experimental.vector.insert.nxv8bf16.nxv4bf16( %sv0, %sv1, i64 0) + ret %v0 +} + +define @insert_nxv8bf16_nxv4bf16_4( %sv0, %sv1) nounwind { +; CHECK-LABEL: insert_nxv8bf16_nxv4bf16_4: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %v0 = call @llvm.experimental.vector.insert.nxv8bf16.nxv4bf16( %sv0, %sv1, i64 4) + ret %v0 +} + +define @insert_nxv4bf16_nxv2bf16_0( %sv0, %sv1) nounwind { +; CHECK-LABEL: insert_nxv4bf16_nxv2bf16_0: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ret + %v0 = call @llvm.experimental.vector.insert.nxv4bf16.nxv2bf16( %sv0, %sv1, i64 0) + ret %v0 +} + +define @insert_nxv4bf16_nxv2bf16_2( %sv0, %sv1) nounwind { +; CHECK-LABEL: insert_nxv4bf16_nxv2bf16_2: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %v0 = call @llvm.experimental.vector.insert.nxv4bf16.nxv2bf16( %sv0, %sv1, i64 2) + ret %v0 +} + ; Test predicate inserts of half size. 
define @insert_nxv16i1_nxv8i1_0( %vec, %sv) { ; CHECK-LABEL: insert_nxv16i1_nxv8i1_0: @@ -616,16 +698,40 @@ ret %v0 } +attributes #0 = { vscale_range(2,2) } + +declare @llvm.experimental.vector.insert.nxv16i8.v16i8(, <16 x i8>, i64) + +declare @llvm.experimental.vector.insert.nxv6i16.nxv1i16(, , i64) +declare @llvm.experimental.vector.insert.nxv8i16.nxv2i16(, , i64) +declare @llvm.experimental.vector.insert.nxv8i16.v8i16(, <8 x i16>, i64) + declare @llvm.experimental.vector.insert.nxv3i32.nxv2i32(, , i64) -declare @llvm.experimental.vector.insert.nxv3f32.nxv2f32(, , i64) +declare @llvm.experimental.vector.insert.nxv4i32.nxv1i32(, , i64) +declare @llvm.experimental.vector.insert.nxv4i32.v4i32(, <4 x i32>, i64) +declare @llvm.experimental.vector.insert.nxv4i32.nxv12i32(, , i64) declare @llvm.experimental.vector.insert.nxv6i32.nxv2i32(, , i64) declare @llvm.experimental.vector.insert.nxv6i32.nxv3i32(, , i64) -declare @llvm.experimental.vector.insert.nxv4i32.nxv12i32(, , i64) -declare @llvm.experimental.vector.insert.nxv8bf16.nxv8bf16(, , i64) -declare @llvm.experimental.vector.insert.nxv8bf16.v8bf16(, <8 x bfloat>, i64) + +declare @llvm.experimental.vector.insert.nxv2bf16.nxv2bf16(, , i64) +declare @llvm.experimental.vector.insert.nxv4bf16.nxv2bf16(, , i64) declare @llvm.experimental.vector.insert.nxv4bf16.nxv4bf16(, , i64) declare @llvm.experimental.vector.insert.nxv4bf16.v4bf16(, <4 x bfloat>, i64) -declare @llvm.experimental.vector.insert.nxv2bf16.nxv2bf16(, , i64) +declare @llvm.experimental.vector.insert.nxv8bf16.nxv8bf16(, , i64) +declare @llvm.experimental.vector.insert.nxv8bf16.nxv4bf16(, , i64) +declare @llvm.experimental.vector.insert.nxv8bf16.v8bf16(, <8 x bfloat>, i64) + +declare @llvm.experimental.vector.insert.nxv2i64.v2i64(, <2 x i64>, i64) +declare @llvm.experimental.vector.insert.nxv2i64.v4i64(, <4 x i64>, i64) +declare @llvm.experimental.vector.insert.nxv8i64.nxv16i64(, , i64) +declare @llvm.experimental.vector.insert.v2i64.nxv16i64(, <2 x i64>, i64) 
+ +declare @llvm.experimental.vector.insert.nxv4f16.nxv2f16(, , i64) +declare @llvm.experimental.vector.insert.nxv8f16.nxv2f16(, , i64) +declare @llvm.experimental.vector.insert.nxv8f16.nxv4f16(, , i64) + +declare @llvm.experimental.vector.insert.nxv3f32.nxv2f32(, , i64) +declare @llvm.experimental.vector.insert.nxv4f32.nxv2f32(, , i64) declare @llvm.experimental.vector.insert.nxv2i1.v8i1(, <8 x i1>, i64) declare @llvm.experimental.vector.insert.nxv4i1.v16i1(, <16 x i1>, i64)