diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -97,6 +97,8 @@
   case ISD::EXTRACT_SUBVECTOR:
                          Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break;
+  case ISD::INSERT_SUBVECTOR:
+                         Res = PromoteIntRes_INSERT_SUBVECTOR(N); break;
   case ISD::VECTOR_REVERSE:
                          Res = PromoteIntRes_VECTOR_REVERSE(N); break;
   case ISD::VECTOR_SHUFFLE:
@@ -4729,6 +4731,50 @@
   return DAG.getBuildVector(NOutVT, dl, Ops);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_SUBVECTOR(SDNode *N) {
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+
+  SDLoc dl(N);
+  SDValue Vec = N->getOperand(0);
+  SDValue SubVec = N->getOperand(1);
+  SDValue Idx = N->getOperand(2);
+
+  auto *ConstantIdx = cast<ConstantSDNode>(Idx);
+  unsigned IdxN = ConstantIdx->getZExtValue();
+
+  EVT VecVT = Vec.getValueType();
+  EVT SubVecVT = SubVec.getValueType();
+
+  // To insert SubVec into Vec, store the wider vector to memory, overwrite the
+  // appropriate bits with the narrower vector, and reload.
+  Align SmallestAlign = DAG.getReducedAlign(SubVecVT, /*UseABI=*/false);
+
+  SDValue StackPtr =
+      DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
+  auto StackPtrVT = StackPtr->getValueType(0);
+  auto &MF = DAG.getMachineFunction();
+  auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+                               SmallestAlign);
+
+  SDValue ScaledIdx = Idx;
+  if (SubVecVT.isScalableVector() && IdxN != 0) {
+    APInt IdxAPInt = cast<ConstantSDNode>(Idx)->getAPIntValue();
+    ScaledIdx = DAG.getVScale(dl, StackPtrVT,
+                              IdxAPInt.sextOrSelf(StackPtrVT.getSizeInBits()));
+  }
+
+  SDValue SubVecPtr =
+      TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, ScaledIdx);
+  Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, PtrInfo, SmallestAlign);
+  return DAG.getExtLoad(ISD::LoadExtType::EXTLOAD, dl, NOutVT, Store, StackPtr,
+                        PtrInfo, OutVT, SmallestAlign);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_REVERSE(SDNode *N) {
   SDLoc dl(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -298,6 +298,7 @@
   SDValue PromoteIntRes_Atomic1(AtomicSDNode *N);
   SDValue PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, unsigned ResNo);
   SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
+  SDValue PromoteIntRes_INSERT_SUBVECTOR(SDNode *N);
   SDValue PromoteIntRes_VECTOR_REVERSE(SDNode *N);
   SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
   SDValue PromoteIntRes_VECTOR_SPLICE(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7837,11 +7837,13 @@
   assert(EltSize * 8 == EltVT.getFixedSizeInBits() &&
          "Converting bits to bytes lost precision");
 
-  assert(SubVecVT.isFixedLengthVector() &&
-         SubVecVT.getVectorElementType() == EltVT &&
-         "Sub-vector must be a fixed vector with matching element type");
-  Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl,
-                                  SubVecVT.getVectorNumElements());
+  // Scalable vectors don't need clamping as these are checked at compile time
+  if (SubVecVT.isFixedLengthVector()) {
+    assert(SubVecVT.getVectorElementType() == EltVT &&
+           "Sub-vector must be a fixed vector with matching element type");
+    Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl,
+                                    SubVecVT.getVectorNumElements());
+  }
 
   EVT IdxVT = Index.getValueType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17100,6 +17100,10 @@
   case ISD::EXTRACT_SUBVECTOR:
     ReplaceExtractSubVectorResults(N, Results, DAG);
     return;
+  case ISD::INSERT_SUBVECTOR:
+    // Custom lowering has been requested for INSERT_SUBVECTOR -- but delegate
+    // to common code for result type legalisation
+    return;
   case ISD::INTRINSIC_WO_CHAIN: {
     EVT VT = N->getValueType(0);
     assert((VT == MVT::i8 || VT == MVT::i16) &&
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; SCALABLE INSERTED INTO SCALABLE TESTS
+
+define <vscale x 8 x i8> @vec_scalable_subvec_scalable_idx_zero_i8(<vscale x 8 x i8>* %a, <vscale x 4 x i8>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1b { z1.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
+  %subvec = load <vscale x 4 x i8>, <vscale x 4 x i8>* %b
+  %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.nxv4i8(<vscale x 8 x i8> %vec, <vscale x 4 x i8> %subvec, i64 0)
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 8 x i8> @vec_scalable_subvec_scalable_idx_nonzero_i8(<vscale x 8 x i8>* %a, <vscale x 4 x i8>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1b { z1.s }, p1, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
+  %subvec = load <vscale x 4 x i8>, <vscale x 4 x i8>* %b
+  %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.nxv4i8(<vscale x 8 x i8> %vec, <vscale x 4 x i8> %subvec, i64 4)
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 4 x i16> @vec_scalable_subvec_scalable_idx_zero_i16(<vscale x 4 x i16>* %a, <vscale x 2 x i16>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.d }, p1, [sp, #2, mul vl]
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
+  %subvec = load <vscale x 2 x i16>, <vscale x 2 x i16>* %b
+  %ins = call <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.nxv2i16(<vscale x 4 x i16> %vec, <vscale x 2 x i16> %subvec, i64 0)
+  ret <vscale x 4 x i16> %ins
+}
+
+define <vscale x 4 x i16> @vec_scalable_subvec_scalable_idx_nonzero_i16(<vscale x 4 x i16>* %a, <vscale x 2 x i16>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.d }, p1, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
+  %subvec = load <vscale x 2 x i16>, <vscale x 2 x i16>* %b
+  %ins = call <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.nxv2i16(<vscale x 4 x i16> %vec, <vscale x 2 x i16> %subvec, i64 2)
+  ret <vscale x 4 x i16> %ins
+}
+
+; FIXED INSERTED INTO SCALABLE TESTS
+
+define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_zero_i8(<vscale x 8 x i8>* %a, <8 x i8>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    str d1, [x8]
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
+  %subvec = load <8 x i8>, <8 x i8>* %b
+  %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8> %vec, <8 x i8> %subvec, i64 0)
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_nonzero_i8(<vscale x 8 x i8>* %a, <8 x i8>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_nonzero_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    cnth x9
+; CHECK-NEXT:    addpl x10, sp, #4
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    sub x9, x9, #8 // =8
+; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    cmp x9, #8 // =8
+; CHECK-NEXT:    csel x8, x9, x8, lo
+; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    str d1, [x10, x8]
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
+  %subvec = load <8 x i8>, <8 x i8>* %b
+  %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8> %vec, <8 x i8> %subvec, i64 8)
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_zero_i16(<vscale x 4 x i16>* %a, <4 x i16>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    str d1, [x8]
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
+  %subvec = load <4 x i16>, <4 x i16>* %b
+  %ins = call <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.v4i16(<vscale x 4 x i16> %vec, <4 x i16> %subvec, i64 0)
+  ret <vscale x 4 x i16> %ins
+}
+
+define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_nonzero_i16(<vscale x 4 x i16>* %a, <4 x i16>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_nonzero_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    cntw x9
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    sub x9, x9, #4 // =4
+; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    cmp x9, #4 // =4
+; CHECK-NEXT:    csel x8, x9, x8, lo
+; CHECK-NEXT:    addpl x9, sp, #4
+; CHECK-NEXT:    lsl x8, x8, #1
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    str d1, [x9, x8]
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
+  %subvec = load <4 x i16>, <4 x i16>* %b
+  %ins = call <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.v4i16(<vscale x 4 x i16> %vec, <4 x i16> %subvec, i64 4)
+  ret <vscale x 4 x i16> %ins
+}
+
+define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_zero_i32(<vscale x 2 x i32>* %a, <2 x i32>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    str d1, [x8]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 2 x i32>, <vscale x 2 x i32>* %a
+  %subvec = load <2 x i32>, <2 x i32>* %b
+  %ins = call <vscale x 2 x i32> @llvm.experimental.vector.insert.nxv2i32.v2i32(<vscale x 2 x i32> %vec, <2 x i32> %subvec, i64 0)
+  ret <vscale x 2 x i32> %ins
+}
+
+define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_i32(<vscale x 2 x i32>* %a, <2 x i32>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_nonzero_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    sub x9, x9, #2 // =2
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    cmp x9, #2 // =2
+; CHECK-NEXT:    csel x8, x9, x8, lo
+; CHECK-NEXT:    addpl x9, sp, #4
+; CHECK-NEXT:    lsl x8, x8, #2
+; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    str d1, [x9, x8]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 2 x i32>, <vscale x 2 x i32>* %a
+  %subvec = load <2 x i32>, <2 x i32>* %b
+  %ins = call <vscale x 2 x i32> @llvm.experimental.vector.insert.nxv2i32.v2i32(<vscale x 2 x i32> %vec, <2 x i32> %subvec, i64 2)
+  ret <vscale x 2 x i32> %ins
+}
+
+define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_large_i32(<vscale x 2 x i32>* %a, <8 x i32>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_nonzero_large_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    subs x8, x8, #8 // =8
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ldp q1, q2, [x1]
+; CHECK-NEXT:    csel x8, xzr, x8, lo
+; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    cmp x8, #8 // =8
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    add x8, x9, x8, lsl #2
+; CHECK-NEXT:    st1w { z0.d }, p0, [sp]
+; CHECK-NEXT:    stp q1, q2, [x8]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 2 x i32>, <vscale x 2 x i32>* %a
+  %subvec = load <8 x i32>, <8 x i32>* %b
+  %ins = call <vscale x 2 x i32> @llvm.experimental.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> %vec, <8 x i32> %subvec, i64 8)
+  ret <vscale x 2 x i32> %ins
+}
+
+declare <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.nxv4i8(<vscale x 8 x i8>, <vscale x 4 x i8>, i64)
+declare <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.nxv2i16(<vscale x 4 x i16>, <vscale x 2 x i16>, i64)
+
+declare <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8>, <8 x i8>, i64)
+declare <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.v4i16(<vscale x 4 x i16>, <4 x i16>, i64)
+declare <vscale x 2 x i32> @llvm.experimental.vector.insert.nxv2i32.v2i32(<vscale x 2 x i32>, <2 x i32>, i64)
+
+declare <vscale x 2 x i32> @llvm.experimental.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32>, <8 x i32>, i64)
+
+attributes #0 = { nounwind "target-features"="+sve" }
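Note (not part of the patch): the sketch below is a plain, standalone C++ illustration of the store/overwrite/reload strategy that PromoteIntRes_INSERT_SUBVECTOR applies via a stack temporary -- write the wide container vector to scratch memory, overwrite the byte range addressed through getVectorSubVecPointer, then load the result back. It uses ordinary byte buffers in place of SelectionDAG nodes and the stack slot; all names in it are illustrative and are not LLVM APIs.

```cpp
#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>

// Model of the memory-based INSERT_SUBVECTOR legalization: store the container
// vector to a scratch buffer ("stack temporary"), overwrite the sub-range at
// the given element index, and reload the combined result. One byte stands in
// for one vector element.
std::vector<unsigned char>
insertSubvectorViaMemory(const std::vector<unsigned char> &Vec,
                         const std::vector<unsigned char> &SubVec,
                         size_t Idx) {
  assert(Idx + SubVec.size() <= Vec.size() && "insert must fit in container");

  std::vector<unsigned char> Scratch(Vec.size());       // "stack temporary"
  std::memcpy(Scratch.data(), Vec.data(), Vec.size());  // store wide vector
  std::memcpy(Scratch.data() + Idx, SubVec.data(),      // overwrite sub-range
              SubVec.size());
  return Scratch;                                        // reload the result
}

int main() {
  std::vector<unsigned char> Vec = {0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<unsigned char> Sub = {9, 9, 9, 9};
  std::vector<unsigned char> Res = insertSubvectorViaMemory(Vec, Sub, 4);
  for (unsigned char B : Res)
    std::printf("%d ", B);  // prints: 0 1 2 3 9 9 9 9
  std::printf("\n");
  return 0;
}
```

The round trip through memory is what makes the approach work for scalable vectors, where the element count is unknown at compile time: the scratch allocation and the sub-vector offset scale with vscale at run time (see the addvl/addpl arithmetic in the CHECK lines above), and the final extending load produces the promoted result type.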