diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4488,6 +4488,14 @@
   SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
                                   SDValue Index) const;
 
+  /// Get a pointer to a sub-vector of type \p SubVecVT at index \p Idx located
+  /// in memory for a vector of type \p VecVT starting at a base address of
+  /// \p VecPtr. If \p Idx plus the size of \p SubVecVT is out of bounds the
+  /// returned pointer is unspecified, but the value returned will be such that
+  /// the entire subvector would be within the vector bounds.
+  SDValue getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
+                                 EVT SubVecVT, SDValue Index) const;
+
   /// Method for building the DAG expansion of ISD::[US][MIN|MAX]. This
   /// method accepts integers as its arguments.
   SDValue expandIntMINMAX(SDNode *Node, SelectionDAG &DAG) const;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1370,17 +1370,19 @@
                       MachinePointerInfo());
   }
 
-  StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
-
   SDValue NewLoad;
 
-  if (Op.getValueType().isVector())
+  if (Op.getValueType().isVector()) {
+    StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT,
+                                          Op.getValueType(), Idx);
     NewLoad =
         DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, MachinePointerInfo());
-  else
+  } else {
+    StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
     NewLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr,
                              MachinePointerInfo(),
                              VecVT.getVectorElementType());
+  }
 
   // Replace the chain going out of the store, by the one out of the load.
   DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1));
@@ -1405,6 +1407,7 @@
 
   // Store the value to a temporary stack slot, then LOAD the returned part.
   EVT VecVT = Vec.getValueType();
+  EVT SubVecVT = Part.getValueType();
   SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
   int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
   MachinePointerInfo PtrInfo =
@@ -1414,7 +1417,8 @@
   SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);
 
   // Then store the inserted part.
-  SDValue SubStackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
+  SDValue SubStackPtr =
+      TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, Idx);
 
   // Store the subvector.
   Ch = DAG.getStore(
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1313,7 +1313,8 @@
                                SmallestAlign);
 
   // Store the new subvector into the specified index.
-  SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
+  SDValue SubVecPtr =
+      TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, Idx);
   Store = DAG.getStore(Store, dl, SubVec, SubVecPtr,
                        MachinePointerInfo::getUnknownStack(MF));
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7781,10 +7781,9 @@
   return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Increment);
 }
 
-static SDValue clampDynamicVectorIndex(SelectionDAG &DAG,
-                                       SDValue Idx,
-                                       EVT VecVT,
-                                       const SDLoc &dl) {
+static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx,
+                                       EVT VecVT, const SDLoc &dl,
+                                       unsigned NumSubElts) {
   if (!VecVT.isScalableVector() && isa<ConstantSDNode>(Idx))
     return Idx;
 
@@ -7794,26 +7793,38 @@
     // If this is a constant index and we know the value is less than the
    // minimum number of elements then it's safe to return Idx.
     if (auto *IdxCst = dyn_cast<ConstantSDNode>(Idx))
-      if (IdxCst->getZExtValue() < NElts)
+      if (IdxCst->getZExtValue() + (NumSubElts - 1) < NElts)
         return Idx;
     SDValue VS =
         DAG.getVScale(dl, IdxVT, APInt(IdxVT.getFixedSizeInBits(), NElts));
-    SDValue Sub =
-        DAG.getNode(ISD::SUB, dl, IdxVT, VS, DAG.getConstant(1, dl, IdxVT));
+    unsigned SubOpcode = NumSubElts <= NElts ? ISD::SUB : ISD::USUBSAT;
+    SDValue Sub = DAG.getNode(SubOpcode, dl, IdxVT, VS,
+                              DAG.getConstant(NumSubElts, dl, IdxVT));
     return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, Sub);
   }
-  if (isPowerOf2_32(NElts)) {
+  if (isPowerOf2_32(NElts) && NumSubElts == 1) {
     APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(), Log2_32(NElts));
     return DAG.getNode(ISD::AND, dl, IdxVT, Idx,
                        DAG.getConstant(Imm, dl, IdxVT));
   }
+  unsigned MaxIndex = NumSubElts < NElts ? NElts - NumSubElts : 0;
   return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx,
-                     DAG.getConstant(NElts - 1, dl, IdxVT));
+                     DAG.getConstant(MaxIndex, dl, IdxVT));
 }
 
 SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
                                                 SDValue VecPtr, EVT VecVT,
                                                 SDValue Index) const {
+  return getVectorSubVecPointer(
+      DAG, VecPtr, VecVT,
+      EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), 1),
+      Index);
+}
+
+SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG,
+                                               SDValue VecPtr, EVT VecVT,
+                                               EVT SubVecVT,
+                                               SDValue Index) const {
   SDLoc dl(Index);
   // Make sure the index type is big enough to compute in.
   Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType());
@@ -7825,7 +7836,11 @@
   assert(EltSize * 8 == EltVT.getFixedSizeInBits() &&
          "Converting bits to bytes lost precision");
 
-  Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl);
+  assert(SubVecVT.isFixedLengthVector() &&
+         SubVecVT.getVectorElementType() == EltVT &&
+         "Sub-vector must be a fixed vector with matching element type");
+  Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl,
+                                  SubVecVT.getVectorNumElements());
 
   EVT IdxVT = Index.getValueType();
 
diff --git a/llvm/test/CodeGen/AArch64/split-vector-insert.ll b/llvm/test/CodeGen/AArch64/split-vector-insert.ll
--- a/llvm/test/CodeGen/AArch64/split-vector-insert.ll
+++ b/llvm/test/CodeGen/AArch64/split-vector-insert.ll
@@ -24,7 +24,7 @@
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    str q1, [sp]
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #2 // =2
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT:    mov w8, #2
 ; CHECK-NEXT:    cmp x9, #2 // =2
@@ -74,7 +74,7 @@
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    str q1, [sp]
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #2 // =2
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT:    mov w8, #2
 ; CHECK-NEXT:    cmp x9, #2 // =2
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-vector.ll
--- a/llvm/test/CodeGen/AArch64/sve-extract-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-vector.ll
@@ -18,7 +18,7 @@
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #2 // =2
 ; CHECK-NEXT:    mov w8, #2
 ; CHECK-NEXT:    cmp x9, #2 // =2
 ; CHECK-NEXT:    ptrue p0.d
@@ -30,8 +30,8 @@
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
-%retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 2)
-ret <2 x i64> %retval
+  %retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 2)
+  ret <2 x i64> %retval
 }
 
 ; Should codegen to a nop, since idx is zero.
@@ -40,8 +40,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-%retval = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
-ret <4 x i32> %retval
+  %retval = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
+  ret <4 x i32> %retval
 }
 
 ; Goes through memory currently; idx != 0.
@@ -51,7 +51,7 @@
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    cntw x9
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #4 // =4
 ; CHECK-NEXT:    mov w8, #4
 ; CHECK-NEXT:    cmp x9, #4 // =4
 ; CHECK-NEXT:    ptrue p0.s
@@ -84,7 +84,7 @@
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    cnth x9
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #8 // =8
 ; CHECK-NEXT:    mov w8, #8
 ; CHECK-NEXT:    cmp x9, #8 // =8
 ; CHECK-NEXT:    ptrue p0.h
@@ -117,7 +117,7 @@
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #16 // =16
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    mov w8, #16
 ; CHECK-NEXT:    cmp x9, #16 // =16
@@ -151,11 +151,62 @@
   ret <vscale x 1 x i16> %retval
 }
 
+; Fixed length clamping
+
+define <2 x i64> @extract_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind #0 {
+; CHECK-LABEL: extract_fixed_v2i64_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub x9, x9, #2 // =2
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    cmp x9, #2 // =2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    csel x8, x9, x8, lo
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    lsl x8, x8, #3
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    ldr q0, [x9, x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 2)
+  ret <2 x i64> %retval
+}
+
+define <4 x i64> @extract_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind #0 {
+; CHECK-LABEL: extract_fixed_v4i64_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    subs x9, x9, #4 // =4
+; CHECK-NEXT:    csel x9, xzr, x9, lo
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w10, #4
+; CHECK-NEXT:    cmp x9, #4 // =4
+; CHECK-NEXT:    ptrue p1.d, vl4
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x10, x9, lsl #3]
+; CHECK-NEXT:    st1d { z0.d }, p1, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> %vec, i64 4)
+  ret <4 x i64> %retval
+}
+
+attributes #0 = { vscale_range(2,2) }
 
 declare <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64>, i64)
 declare <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32>, i64)
 declare <8 x i16> @llvm.experimental.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16>, i64)
 declare <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8>, i64)
+declare <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64>, i64)
+
 declare <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32>, i64)
 declare <vscale x 1 x i16> @llvm.experimental.vector.extract.nxv1i16.nxv6i16(<vscale x 6 x i16>, i64)
 
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -23,7 +23,7 @@
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #2 // =2
 ; CHECK-NEXT:    mov w8, #2
 ; CHECK-NEXT:    cmp x9, #2 // =2
 ; CHECK-NEXT:    csel x8, x9, x8, lo
@@ -62,7 +62,7 @@
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    cntw x9
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #4 // =4
 ; CHECK-NEXT:    mov w8, #4
 ; CHECK-NEXT:    cmp x9, #4 // =4
 ; CHECK-NEXT:    csel x8, x9, x8, lo
@@ -101,7 +101,7 @@
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    cnth x9
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #8 // =8
 ; CHECK-NEXT:    mov w8, #8
 ; CHECK-NEXT:    cmp x9, #8 // =8
 ; CHECK-NEXT:    csel x8, x9, x8, lo
@@ -140,7 +140,7 @@
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #16 // =16
 ; CHECK-NEXT:    mov w8, #16
 ; CHECK-NEXT:    cmp x9, #16 // =16
 ; CHECK-NEXT:    ptrue p0.b
@@ -299,12 +299,66 @@
   ret <vscale x 4 x i32> %retval
 }
 
+; Fixed length clamping
+
+define <vscale x 2 x i64> @insert_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind #0 {
+; CHECK-LABEL: insert_fixed_v2i64_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub x9, x9, #2 // =2
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    cmp x9, #2 // =2
+; CHECK-NEXT:    csel x8, x9, x8, lo
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    lsl x8, x8, #3
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    str q1, [x9, x8]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 2)
+  ret <vscale x 2 x i64> %retval
+}
+
+define <vscale x 2 x i64> @insert_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec, <4 x i64>* %ptr) nounwind #0 {
+; CHECK-LABEL: insert_fixed_v4i64_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    subs x8, x8, #4 // =4
+; CHECK-NEXT:    csel x8, xzr, x8, lo
+; CHECK-NEXT:    mov w9, #4
+; CHECK-NEXT:    cmp x8, #4 // =4
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1d { z0.d }, p1, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x9, x8, lsl #3]
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %subvec = load <4 x i64>, <4 x i64>* %ptr
+  %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> %vec, <4 x i64> %subvec, i64 4)
+  ret <vscale x 2 x i64> %retval
+}
+
+attributes #0 = { vscale_range(2,2) }
 
 declare <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
 declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
 declare <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
 declare <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
+declare <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)
+
 declare <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64)
 declare <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
 declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32>, <vscale x 1 x i32>, i64)
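
For illustration only, and not part of the patch above: a minimal sketch of how DAG-level lowering code can use the new TargetLowering::getVectorSubVecPointer hook. The helper name extractSubvecViaStack is hypothetical; the SelectionDAG calls it uses (CreateStackTemporary, getStore, getLoad) are the same ones the legalizer changes above rely on.

// Illustration only: a hypothetical helper that spills a vector to a stack
// temporary and reloads a fixed-length subvector from a clamped address,
// mirroring the legalizer pattern in this patch.
static SDValue extractSubvecViaStack(SelectionDAG &DAG,
                                     const TargetLowering &TLI, SDValue Vec,
                                     EVT SubVecVT, SDValue Idx,
                                     const SDLoc &dl) {
  EVT VecVT = Vec.getValueType();
  // Spill the whole vector.
  SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
                            MachinePointerInfo());
  // Clamp Idx so that elements Idx .. Idx + NumSubElts - 1 all stay within
  // VecVT; getVectorElementPointer only kept element Idx itself in bounds.
  SDValue SubVecPtr =
      TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, Idx);
  return DAG.getLoad(SubVecVT, dl, Ch, SubVecPtr, MachinePointerInfo());
}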