diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4477,7 +4477,7 @@
   /// bounds the returned pointer is unspecified, but will be within the vector
   /// bounds.
   SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
-                                  SDValue Index) const;
+                                  SDValue Index, bool Clamp = true) const;
 
   /// Method for building the DAG expansion of ISD::[US][MIN|MAX]. This
   /// method accepts integers as its arguments.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -96,6 +96,8 @@
   case ISD::EXTRACT_SUBVECTOR:
     Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break;
+  case ISD::INSERT_SUBVECTOR:
+    Res = PromoteIntRes_INSERT_SUBVECTOR(N); break;
   case ISD::VECTOR_REVERSE:
     Res = PromoteIntRes_VECTOR_REVERSE(N); break;
   case ISD::VECTOR_SHUFFLE:
@@ -4698,6 +4700,51 @@
   return DAG.getBuildVector(NOutVT, dl, Ops);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_SUBVECTOR(SDNode *N) {
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+
+  SDLoc dl(N);
+  SDValue Vec = N->getOperand(0);
+  SDValue SubVec = N->getOperand(1);
+  SDValue Idx = N->getOperand(2);
+
+  auto *ConstantIdx = cast<ConstantSDNode>(Idx);
+  unsigned IdxN = ConstantIdx->getZExtValue();
+
+  EVT VecVT = Vec.getValueType();
+  EVT SubVecVT = SubVec.getValueType();
+
+  // To insert SubVec into Vec, store the wider vector to memory, overwrite the
+  // appropriate bits with the narrower vector, and reload.
+  Align SmallestAlign = DAG.getReducedAlign(SubVecVT, /*UseABI=*/false);
+  SDValue StackPtr =
+      DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
+  auto StackPtrVT = StackPtr->getValueType(0);
+  auto &MF = DAG.getMachineFunction();
+  auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+                               SmallestAlign);
+
+  SDValue ScaledIdx = Idx;
+  if (IdxN != 0) {
+    APInt IdxAPInt = cast<ConstantSDNode>(Idx)->getAPIntValue();
+    ScaledIdx = DAG.getVScale(dl, StackPtrVT,
+                              IdxAPInt.sextOrSelf(StackPtrVT.getSizeInBits()));
+  }
+
+  // Only clamp for the case when inserting a fixed vector into a scalable one.
+  bool Clamp = VecVT.isScalableVector() != SubVecVT.isScalableVector();
+
+  SDValue SubVecPtr =
+      TLI.getVectorElementPointer(DAG, StackPtr, SubVecVT, ScaledIdx, Clamp);
+  Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, PtrInfo, SmallestAlign);
+  return DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_REVERSE(SDNode *N) {
   SDLoc dl(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -298,6 +298,7 @@
   SDValue PromoteIntRes_Atomic1(AtomicSDNode *N);
   SDValue PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, unsigned ResNo);
   SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
+  SDValue PromoteIntRes_INSERT_SUBVECTOR(SDNode *N);
   SDValue PromoteIntRes_VECTOR_REVERSE(SDNode *N);
   SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
   SDValue PromoteIntRes_VECTOR_SPLICE(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7804,7 +7804,8 @@
 SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
                                                 SDValue VecPtr, EVT VecVT,
-                                                SDValue Index) const {
+                                                SDValue Index,
+                                                bool Clamp) const {
   SDLoc dl(Index);
   // Make sure the index type is big enough to compute in.
   Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType());
@@ -7816,7 +7817,8 @@
   assert(EltSize * 8 == EltVT.getFixedSizeInBits() &&
          "Converting bits to bytes lost precision");
 
-  Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl);
+  if (Clamp)
+    Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl);
 
   EVT IdxVT = Index.getValueType();
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16718,6 +16718,10 @@
   case ISD::EXTRACT_SUBVECTOR:
     ReplaceExtractSubVectorResults(N, Results, DAG);
     return;
+  case ISD::INSERT_SUBVECTOR:
+    // Custom lowering has been requested for INSERT_SUBVECTOR -- but delegate
+    // to common code for result type legalisation.
+    return;
   case ISD::INTRINSIC_WO_CHAIN: {
     EVT VT = N->getValueType(0);
     assert((VT == MVT::i8 || VT == MVT::i16) &&
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -0,0 +1,248 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s --check-prefix=CHECK
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; SCALABLE INSERTED INTO SCALABLE TESTS
+
+define <vscale x 8 x i8> @vec_scalable_subvec_scalable_idx_zero_i8(<vscale x 8 x i8>* %a, <vscale x 4 x i8>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1b { z1.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
+  %subvec = load <vscale x 4 x i8>, <vscale x 4 x i8>* %b
+  %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.nxv4i8(<vscale x 8 x i8> %vec, <vscale x 4 x i8> %subvec, i64 0)
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 8 x i8> @vec_scalable_subvec_scalable_idx_nonzero_i8(<vscale x 8 x i8>* %a, <vscale x 4 x i8>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addpl x9, sp, #4
+; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1b { z1.s }, p1, [x9, x8]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
+  %subvec = load <vscale x 4 x i8>, <vscale x 4 x i8>* %b
+  %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.nxv4i8(<vscale x 8 x i8> %vec, <vscale x 4 x i8> %subvec, i64 2)
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 4 x i16> @vec_scalable_subvec_scalable_idx_zero_i16(<vscale x 4 x i16>* %a, <vscale x 2 x i16>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.d }, p1, [sp, #2, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
+  %subvec = load <vscale x 2 x i16>, <vscale x 2 x i16>* %b
+  %ins = call <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.nxv2i16(<vscale x 4 x i16> %vec, <vscale x 2 x i16> %subvec, i64 0)
+  ret <vscale x 4 x i16> %ins
+}
+
+define <vscale x 4 x i16> @vec_scalable_subvec_scalable_idx_nonzero_i16(<vscale x 4 x i16>* %a, <vscale x 2 x i16>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.d }, p1/z, [x1]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.d }, p1, [x8, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
+  %subvec = load <vscale x 2 x i16>, <vscale x 2 x i16>* %b
+  %ins = call <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.nxv2i16(<vscale x 4 x i16> %vec, <vscale x 2 x i16> %subvec, i64 2)
+  ret <vscale x 4 x i16> %ins
+}
+
+; FIXED INSERTED INTO SCALABLE TESTS
+
+define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_zero_i8(<vscale x 8 x i8>* %a, <8 x i8>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    str d1, [x8]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
+  %subvec = load <8 x i8>, <8 x i8>* %b
+  %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8> %vec, <8 x i8> %subvec, i64 0)
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_nonzero_i8(<vscale x 8 x i8>* %a, <8 x i8>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_nonzero_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addpl x9, sp, #4
+; CHECK-NEXT:    bfxil x9, x8, #0, #3
+; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    str d1, [x9]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
+  %subvec = load <8 x i8>, <8 x i8>* %b
+  %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8> %vec, <8 x i8> %subvec, i64 2)
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_zero_i16(<vscale x 4 x i16>* %a, <4 x i16>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    str d1, [x8]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
+  %subvec = load <4 x i16>, <4 x i16>* %b
+  %ins = call <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.v4i16(<vscale x 4 x i16> %vec, <4 x i16> %subvec, i64 0)
+  ret <vscale x 4 x i16> %ins
+}
+
+define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_nonzero_i16(<vscale x 4 x i16>* %a, <4 x i16>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_nonzero_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    addpl x9, sp, #4
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    and x8, x8, #0x3
+; CHECK-NEXT:    bfi x9, x8, #1, #2
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    str d1, [x9]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
+  %subvec = load <4 x i16>, <4 x i16>* %b
+  %ins = call <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.v4i16(<vscale x 4 x i16> %vec, <4 x i16> %subvec, i64 2)
+  ret <vscale x 4 x i16> %ins
+}
+
+define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_zero_i32(<vscale x 2 x i32>* %a, <2 x i32>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    str d1, [x8]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 2 x i32>, <vscale x 2 x i32>* %a
+  %subvec = load <2 x i32>, <2 x i32>* %b
+  %ins = call <vscale x 2 x i32> @llvm.experimental.vector.insert.nxv2i32.v2i32(<vscale x 2 x i32> %vec, <2 x i32> %subvec, i64 0)
+  ret <vscale x 2 x i32> %ins
+}
+
+define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_i32(<vscale x 2 x i32>* %a, <2 x i32>* %b) #0 {
+; CHECK-LABEL: vec_scalable_subvec_fixed_idx_nonzero_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    addpl x9, sp, #4
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    and x8, x8, #0x1
+; CHECK-NEXT:    bfi x9, x8, #2, #1
+; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    str d1, [x9]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 2 x i32>, <vscale x 2 x i32>* %a
+  %subvec = load <2 x i32>, <2 x i32>* %b
+  %ins = call <vscale x 2 x i32> @llvm.experimental.vector.insert.nxv2i32.v2i32(<vscale x 2 x i32> %vec, <2 x i32> %subvec, i64 2)
+  ret <vscale x 2 x i32> %ins
+}
+
+declare <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.nxv4i8(<vscale x 8 x i8>, <vscale x 4 x i8>, i64)
+declare <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.nxv2i16(<vscale x 4 x i16>, <vscale x 2 x i16>, i64)
+
+declare <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8>, <8 x i8>, i64)
+declare <vscale x 4 x i16> @llvm.experimental.vector.insert.nxv4i16.v4i16(<vscale x 4 x i16>, <4 x i16>, i64)
+declare <vscale x 2 x i32> @llvm.experimental.vector.insert.nxv2i32.v2i32(<vscale x 2 x i32>, <2 x i32>, i64)
+
+attributes #0 = { nounwind "target-features"="+sve" }
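
Not part of the patch: below is a minimal standalone C++ sketch of the store/overwrite/reload idea that PromoteIntRes_INSERT_SUBVECTOR implements above, including the index clamping that the new Clamp parameter controls. The helper name insertSubvector and the use of std::vector are illustrative assumptions only; the real code builds SelectionDAG stores and loads through a stack temporary rather than operating on containers.

// Standalone model (hypothetical names, no LLVM APIs): spill the wide vector
// to a scratch buffer, overwrite the slice at a possibly clamped element
// index, then reload the combined result.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

static std::vector<int> insertSubvector(const std::vector<int> &Vec,
                                        const std::vector<int> &Sub,
                                        std::size_t Idx, bool Clamp) {
  std::vector<int> Buf(Vec); // "store" the wide vector to the stack slot
  if (Clamp)                 // keep the subvector inside the wide vector
    Idx = std::min(Idx, Vec.size() - Sub.size());
  std::copy(Sub.begin(), Sub.end(), Buf.begin() + Idx); // overwrite the slice
  return Buf;                // "reload" the combined vector
}

int main() {
  std::vector<int> Vec{0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<int> Sub{40, 41, 42, 43};
  // Index 6 would run past the end, so clamping moves it back to 4.
  for (int V : insertSubvector(Vec, Sub, 6, /*Clamp=*/true))
    std::printf("%d ", V); // prints: 0 1 2 3 40 41 42 43
  std::printf("\n");
  return 0;
}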