diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -908,6 +908,7 @@
   SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
   SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N);
   SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
+  SDValue WidenVecRes_INSERT_SUBVECTOR(SDNode *N);
   SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
   SDValue WidenVecRes_LOAD(SDNode* N);
   SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3038,6 +3038,9 @@
   case ISD::BITCAST:           Res = WidenVecRes_BITCAST(N); break;
   case ISD::BUILD_VECTOR:      Res = WidenVecRes_BUILD_VECTOR(N); break;
   case ISD::CONCAT_VECTORS:    Res = WidenVecRes_CONCAT_VECTORS(N); break;
+  case ISD::INSERT_SUBVECTOR:
+    Res = WidenVecRes_INSERT_SUBVECTOR(N);
+    break;
   case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
   case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
   case ISD::LOAD:              Res = WidenVecRes_LOAD(N); break;
@@ -4059,6 +4062,16 @@
   return DAG.getBuildVector(WidenVT, dl, Ops);
 }
 
+SDValue DAGTypeLegalizer::WidenVecRes_INSERT_SUBVECTOR(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+  SDValue InOp2 = N->getOperand(1);
+  SDValue Idx = N->getOperand(2);
+  SDLoc dl(N);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WidenVT, InOp1, InOp2, Idx);
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
   EVT VT = N->getValueType(0);
   EVT EltVT = VT.getVectorElementType();
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -399,3 +399,95 @@
 declare <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16>, <vscale x 2 x i16>, i64)
 declare <vscale x 8 x half> @llvm.experimental.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half>, <vscale x 2 x half>, i64)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Unpacked types that need result widening
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32(<vscale x 2 x i32> %sv0) {
+; CHECK-LABEL: insert_nxv3i32_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.d, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 3 x i32> @llvm.experimental.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32> undef, <vscale x 2 x i32> %sv0, i64 0)
+  ret <vscale x 3 x i32> %v0
+}
+
+;; Check that the subvector is not widened so it does not crash.
+define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32_2(<vscale x 3 x i32> %sv0, <vscale x 2 x i32> %sv1) {
+; CHECK-LABEL: insert_nxv3i32_nxv2i32_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 3 x i32> @llvm.experimental.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32> %sv0, <vscale x 2 x i32> %sv1, i64 0)
+  ret <vscale x 3 x i32> %v0
+}
+
+define <vscale x 3 x float> @insert_nxv3f32_nxv2f32(<vscale x 2 x float> %sv0) {
+; CHECK-LABEL: insert_nxv3f32_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1w { z0.d }, p0, [sp]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 3 x float> @llvm.experimental.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float> undef, <vscale x 2 x float> %sv0, i64 0)
+  ret <vscale x 3 x float> %v0
+}
+
+define <vscale x 6 x i32> @insert_nxv6i32_nxv2i32(<vscale x 2 x i32> %sv0, <vscale x 2 x i32> %sv1) {
+; CHECK-LABEL: insert_nxv6i32_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    st1w { z1.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p1, [sp]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32> undef, <vscale x 2 x i32> %sv0, i64 0)
+  %v1 = call <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32> %v0, <vscale x 2 x i32> %sv0, i64 2)
+  ret <vscale x 6 x i32> %v1
+}
+
+;; This only works because the input vector is undef and the index is zero.
+define <vscale x 6 x i32> @insert_nxv6i32_nxv3i32(<vscale x 3 x i32> %sv0) {
+; CHECK-LABEL: insert_nxv6i32_nxv3i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32> undef, <vscale x 3 x i32> %sv0, i64 0)
+  ret <vscale x 6 x i32> %v0
+}
+
+define <vscale x 12 x i32> @insert_nxv12i32_nxv4i32(<vscale x 4 x i32> %sv0, <vscale x 4 x i32> %sv1, <vscale x 4 x i32> %sv2) {
+; CHECK-LABEL: insert_nxv12i32_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 12 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> undef, <vscale x 4 x i32> %sv0, i64 0)
+  %v1 = call <vscale x 12 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> %v0, <vscale x 4 x i32> %sv1, i64 4)
+  %v2 = call <vscale x 12 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> %v1, <vscale x 4 x i32> %sv2, i64 8)
+  ret <vscale x 12 x i32> %v2
+}
+
+declare <vscale x 3 x i32> @llvm.experimental.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32>, <vscale x 2 x i32>, i64)
+declare <vscale x 3 x float> @llvm.experimental.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float>, <vscale x 2 x float>, i64)
+declare <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32>, <vscale x 2 x i32>, i64)
+declare <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32>, <vscale x 3 x i32>, i64)
+declare <vscale x 12 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32>, <vscale x 4 x i32>, i64)
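
To make the legalization rule concrete: WidenVecRes_INSERT_SUBVECTOR widens only the result type and the vector operand being inserted into; the subvector operand and the index are passed through unchanged, which is exactly what the insert_nxv3i32_nxv2i32_2 test guards. Below is a minimal standalone sketch of that transformation using toy types rather than the LLVM API; the names ToyVT, widen, and widenResult are illustrative only.

// Toy model of result widening for INSERT_SUBVECTOR, assuming i32
// elements throughout; ToyVT, widen, widenResult are illustrative
// names, not LLVM API.
#include <cassert>
#include <cstdio>

struct ToyVT {
  unsigned MinElts; // nxv<N>i32 -> N
};

// Round a non-power-of-2 scalable element count up to the next power
// of 2, as result widening does: nxv3i32 -> nxv4i32, nxv6i32 -> nxv8i32.
static ToyVT widen(ToyVT VT) {
  unsigned N = 1;
  while (N < VT.MinElts)
    N <<= 1;
  return ToyVT{N};
}

struct InsertSubvector {
  ToyVT Result; // result type of the node
  ToyVT Vec;    // operand 0: vector inserted into (same type as result)
  ToyVT Sub;    // operand 1: subvector, deliberately left untouched
  unsigned Idx; // operand 2: insertion index, also left untouched
};

// Mirrors the shape of WidenVecRes_INSERT_SUBVECTOR: widen the result
// and operand 0, pass operands 1 and 2 through unchanged.
static InsertSubvector widenResult(const InsertSubvector &N) {
  ToyVT WidenVT = widen(N.Result);
  return InsertSubvector{WidenVT, WidenVT, N.Sub, N.Idx};
}

int main() {
  // Insert nxv2i32 into nxv3i32 at index 0, as in insert_nxv3i32_nxv2i32.
  InsertSubvector N{{3}, {3}, {2}, 0};
  InsertSubvector W = widenResult(N);
  assert(W.Result.MinElts == 4 && W.Sub.MinElts == 2);
  std::printf("result nxv%ui32 -> nxv%ui32, subvector stays nxv%ui32\n",
              N.Result.MinElts, W.Result.MinElts, W.Sub.MinElts);
  return 0;
}

This relies on the original (subvector, index) pair remaining valid for the wider type; as the test comments note, the nxv3i32-into-nxv6i32 case is only correct because the input vector is undef and the index is zero.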