diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -96,6 +96,8 @@
   case ISD::EXTRACT_SUBVECTOR:
                          Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break;
+  case ISD::INSERT_SUBVECTOR:
+                         Res = PromoteIntRes_INSERT_SUBVECTOR(N); break;
   case ISD::VECTOR_REVERSE:
                          Res = PromoteIntRes_VECTOR_REVERSE(N); break;
   case ISD::VECTOR_SHUFFLE:
@@ -4698,6 +4700,38 @@
   return DAG.getBuildVector(NOutVT, dl, Ops);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_SUBVECTOR(SDNode *N) {
+
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+
+  SDLoc dl(N);
+  SDValue Vec = N->getOperand(0);
+  SDValue SubVec = N->getOperand(1);
+  SDValue Idx = N->getOperand(2);
+
+  EVT VecVT = Vec.getValueType();
+  EVT SubVecVT = SubVec.getValueType();
+
+  // To insert SubVec into Vec, store the wider vector to memory, overwrite
+  // the relevant part with the narrower vector, and reload.
+  Align SmallestAlign = DAG.getReducedAlign(SubVecVT, /*UseABI=*/false);
+  SDValue StackPtr =
+      DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
+  auto &MF = DAG.getMachineFunction();
+  auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+                               SmallestAlign);
+
+  SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, SubVecVT, Idx);
+  Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, PtrInfo, SmallestAlign);
+
+  return DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_REVERSE(SDNode *N) {
   SDLoc dl(N);
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -298,6 +298,7 @@
   SDValue PromoteIntRes_Atomic1(AtomicSDNode *N);
   SDValue PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, unsigned ResNo);
   SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
+  SDValue PromoteIntRes_INSERT_SUBVECTOR(SDNode *N);
   SDValue PromoteIntRes_VECTOR_REVERSE(SDNode *N);
   SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
   SDValue PromoteIntRes_VECTOR_SPLICE(SDNode *N);
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -0,0 +1,265 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s --check-prefix=CHECK
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; SCALABLE INSERTED INTO SCALABLE TESTS
+
+define <vscale x 8 x i8> @vec_scalable_subvec_scalable_idx_zero_i8(<vscale x 8 x i8>* %a, <vscale x 4 x i8>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1b { z1.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
+  %subvec = load <vscale x 4 x i8>, <vscale x 4 x i8>* %b
+  %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.nxv4i8(<vscale x 8 x i8> %vec, <vscale x 4 x i8> %subvec, i64 0)
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 8 x i8> @vec_scalable_subvec_scalable_idx_nonzero_i8(<vscale x 8 x i8>* %a, <vscale x 4 x i8>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    orr x8, x8, #0x2
+; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1b { z1.s }, p1, [x8]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
+  %subvec = load <vscale x 4 x i8>, <vscale x 4 x i8>* %b
+  %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.nxv4i8(<vscale x 8 x i8> %vec, <vscale x 4 x i8> %subvec, i64 2)
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 4 x i16> @vec_scalable_subvec_scalable_idx_zero_i16(<vscale x 4 x i16>* %a, <vscale x 2 x i16>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.d }, p1, [sp, #2, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
+  %subvec = load <vscale x 2 x i16>, <vscale x 2 x i16>* %b
+  %ins = call <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.nxv2i16(<vscale x 4 x i16> %vec, <vscale x 2 x i16> %subvec, i64 0)
+  ret <vscale x 4 x i16> %ins
+}
+
+define <vscale x 4 x i16> @vec_scalable_subvec_scalable_idx_nonzero_i16(<vscale x 4 x i16>* %a, <vscale x 2 x i16>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    cmp x9, #2 // =2
+; CHECK-NEXT:    csel x8, x9, x8, lo
+; CHECK-NEXT:    addpl x9, sp, #4
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.d }, p1, [x9, x8, lsl #1]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
+  %subvec = load <vscale x 2 x i16>, <vscale x 2 x i16>* %b
+  %ins = call <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.nxv2i16(<vscale x 4 x i16> %vec, <vscale x 2 x i16> %subvec, i64 2)
+  ret <vscale x 4 x i16> %ins
+}
+
+; FIXED INSERTED INTO SCALABLE TESTS
+
+define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_zero_i8(<vscale x 8 x i8>* %a, <8 x i8>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    str d1, [x8]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
+  %subvec = load <8 x i8>, <8 x i8>* %b
+  %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8> %vec, <8 x i8> %subvec, i64 0)
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_nonzero_i8(<vscale x 8 x i8>* %a, <8 x i8>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_nonzero_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    stur d1, [x8, #2]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
+  %subvec = load <8 x i8>, <8 x i8>* %b
+  %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8> %vec, <8 x i8> %subvec, i64 2)
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_zero_i16(<vscale x 4 x i16>* %a, <4 x i16>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    str d1, [x8]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
+  %subvec = load <4 x i16>, <4 x i16>* %b
+  %ins = call <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.v4i16(<vscale x 4 x i16> %vec, <4 x i16> %subvec, i64 0)
+  ret <vscale x 4 x i16> %ins
+}
+
+define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_nonzero_i16(<vscale x 4 x i16>* %a, <4 x i16>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_nonzero_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    stur d1, [x8, #4]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
+  %subvec = load <4 x i16>, <4 x i16>* %b
+  %ins = call <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.v4i16(<vscale x 4 x i16> %vec, <4 x i16> %subvec, i64 2)
+  ret <vscale x 4 x i16> %ins
+}
+
+define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_zero_i32(<vscale x 2 x i32>* %a, <2 x i32>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    str d1, [x8]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 2 x i32>, <vscale x 2 x i32>* %a
+  %subvec = load <2 x i32>, <2 x i32>* %b
+  %ins = call <vscale x 2 x i32> @llvm.experimental.vector.insert.nxv2i32.v2i32(<vscale x 2 x i32> %vec, <2 x i32> %subvec, i64 0)
+  ret <vscale x 2 x i32> %ins
+}
+
+define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_i32(<vscale x 2 x i32>* %a, <2 x i32>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_nonzero_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    str d1, [x8, #8]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 2 x i32>, <vscale x 2 x i32>* %a
+  %subvec = load <2 x i32>, <2 x i32>* %b
+  %ins = call <vscale x 2 x i32> @llvm.experimental.vector.insert.nxv2i32.v2i32(<vscale x 2 x i32> %vec, <2 x i32> %subvec, i64 2)
+  ret <vscale x 2 x i32> %ins
+}
+
+declare <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.nxv4i8(<vscale x 8 x i8>, <vscale x 4 x i8>, i64)
+declare <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.nxv2i16(<vscale x 4 x i16>, <vscale x 2 x i16>, i64)
+
+declare <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8>, <8 x i8>, i64)
+declare <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.v4i16(<vscale x 4 x i16>, <4 x i16>, i64)
+declare <vscale x 2 x i32> @llvm.experimental.vector.insert.nxv2i32.v2i32(<vscale x 2 x i32>, <2 x i32>, i64)
+
+attributes #0 = { "target-features"="+sve" }