diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4741,38 +4741,15 @@
   SDValue SubVec = N->getOperand(1);
   SDValue Idx = N->getOperand(2);
 
-  auto *ConstantIdx = cast<ConstantSDNode>(Idx);
-  unsigned IdxN = ConstantIdx->getZExtValue();
-
-  EVT VecVT = Vec.getValueType();
   EVT SubVecVT = SubVec.getValueType();
+  EVT NSubVT =
+      EVT::getVectorVT(*DAG.getContext(), NOutVT.getVectorElementType(),
+                       SubVecVT.getVectorElementCount());
 
-  // To insert SubVec into Vec, store the wider vector to memory, overwrite the
-  // appropriate bits with the narrower vector, and reload.
-  Align SmallestAlign = DAG.getReducedAlign(SubVecVT, /*UseABI=*/false);
-
-  SDValue StackPtr =
-      DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
-  auto StackPtrVT = StackPtr->getValueType(0);
-  auto &MF = DAG.getMachineFunction();
-  auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
-  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
-
-  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
-                               SmallestAlign);
-
-  SDValue ScaledIdx = Idx;
-  if (SubVecVT.isScalableVector() && IdxN != 0) {
-    APInt IdxAPInt = cast<ConstantSDNode>(Idx)->getAPIntValue();
-    ScaledIdx = DAG.getVScale(dl, StackPtrVT,
-                              IdxAPInt.sextOrSelf(StackPtrVT.getSizeInBits()));
-  }
+  Vec = GetPromotedInteger(Vec);
+  SubVec = DAG.getNode(ISD::ANY_EXTEND, dl, NSubVT, SubVec);
 
-  SDValue SubVecPtr =
-      TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, ScaledIdx);
-  Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, PtrInfo, SmallestAlign);
-  return DAG.getExtLoad(ISD::LoadExtType::EXTLOAD, dl, NOutVT, Store, StackPtr,
-                        PtrInfo, OutVT, SmallestAlign);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NOutVT, Vec, SubVec, Idx);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_REVERSE(SDNode *N) {
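
NOTE (illustration, not part of the patch): the promoted-integer path above no
longer round-trips through a stack temporary. For a result type such as
<vscale x 8 x i8> (promoted to <vscale x 8 x i16>), the subvector is
any-extended to <vscale x 4 x i16> and inserted with a plain INSERT_SUBVECTOR
node. A minimal IR reproducer, with types mirroring the first test below:

  ; Illegal result type: integer-promoted to <vscale x 8 x i16> during
  ; legalization, which now exercises the new ANY_EXTEND + INSERT_SUBVECTOR path.
  define <vscale x 8 x i8> @insert_example(<vscale x 8 x i8> %vec, <vscale x 4 x i8> %subvec) {
    %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.nxv4i8(<vscale x 8 x i8> %vec, <vscale x 4 x i8> %subvec, i64 0)
    ret <vscale x 8 x i8> %ins
  }
  declare <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.nxv4i8(<vscale x 8 x i8>, <vscale x 4 x i8>, i64)
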
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
--- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -8,17 +8,12 @@
 define <vscale x 8 x i8> @vec_scalable_subvec_scalable_idx_zero_i8(<vscale x 8 x i8>* %a, <vscale x 4 x i8>* %b) #0 {
 ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1b { z1.s }, p1/z, [x1]
-; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    st1b { z1.s }, p1, [sp, #2, mul vl]
-; CHECK-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1b { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
+; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
   %subvec = load <vscale x 4 x i8>, <vscale x 4 x i8>* %b
@@ -29,18 +24,12 @@
 define <vscale x 8 x i8> @vec_scalable_subvec_scalable_idx_nonzero_i8(<vscale x 8 x i8>* %a, <vscale x 4 x i8>* %b) #0 {
 ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1b { z1.s }, p1/z, [x1]
-; CHECK-NEXT:    addpl x8, sp, #4
-; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    st1b { z1.s }, p1, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1b { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
   %subvec = load <vscale x 4 x i8>, <vscale x 4 x i8>* %b
@@ -51,17 +40,12 @@
 define <vscale x 4 x i16> @vec_scalable_subvec_scalable_idx_zero_i16(<vscale x 4 x i16>* %a, <vscale x 2 x i16>* %b) #0 {
 ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.d }, p1/z, [x1]
-; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    st1h { z1.d }, p1, [sp, #2, mul vl]
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
   %subvec = load <vscale x 2 x i16>, <vscale x 2 x i16>* %b
@@ -72,18 +56,12 @@
 define <vscale x 4 x i16> @vec_scalable_subvec_scalable_idx_nonzero_i16(<vscale x 4 x i16>* %a, <vscale x 2 x i16>* %b) #0 {
 ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.d }, p1/z, [x1]
-; CHECK-NEXT:    addpl x8, sp, #4
-; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    st1h { z1.d }, p1, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
   %subvec = load <vscale x 2 x i16>, <vscale x 2 x i16>* %b
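
NOTE: in the scalable-into-scalable cases above the stack round-trip is gone
entirely. Reading the idx-zero i8 test: with %vec promoted into z0.h and
%subvec loaded one byte per 32-bit container into z1.s, "uunpkhi z0.s, z0.h"
moves the upper half of %vec into 32-bit containers, and
"uzp1 z0.h, z1.h, z0.h" keeps the even-numbered halfwords of each operand,
leaving [%subvec, upper half of %vec] in z0.h using only register moves.
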
@@ -101,10 +79,10 @@
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addpl x8, sp, #4
-; CHECK-NEXT:    str d1, [x8]
-; CHECK-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    ushll v0.8h, v1.8b, #0
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -120,17 +98,19 @@
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    cnth x9
-; CHECK-NEXT:    addpl x10, sp, #4
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    cnth x9
 ; CHECK-NEXT:    sub x9, x9, #8 // =8
 ; CHECK-NEXT:    mov w8, #8
 ; CHECK-NEXT:    cmp x9, #8 // =8
 ; CHECK-NEXT:    csel x8, x9, x8, lo
-; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    str d1, [x10, x8]
-; CHECK-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    lsl x8, x8, #1
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    ushll v0.8h, v1.8b, #0
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str q0, [x9, x8]
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -148,10 +128,10 @@
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addpl x8, sp, #4
-; CHECK-NEXT:    str d1, [x8]
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    ushll v0.4s, v1.4h, #0
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -167,18 +147,19 @@
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    cntw x9
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    cntw x9
 ; CHECK-NEXT:    sub x9, x9, #4 // =4
 ; CHECK-NEXT:    mov w8, #4
 ; CHECK-NEXT:    cmp x9, #4 // =4
 ; CHECK-NEXT:    csel x8, x9, x8, lo
-; CHECK-NEXT:    addpl x9, sp, #4
-; CHECK-NEXT:    lsl x8, x8, #1
-; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    str d1, [x9, x8]
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    lsl x8, x8, #2
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    ushll v0.4s, v1.4h, #0
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str q0, [x9, x8]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -196,10 +177,10 @@
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addpl x8, sp, #4
-; CHECK-NEXT:    str d1, [x8]
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
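
NOTE: fixed-length subvectors are still inserted through a stack temporary, but
the traffic now happens in the promoted element type: the scalable vector is
stored with a size-preserving st1h/st1w/st1d, the NEON subvector is widened
with ushll and stored as a whole q register, and the result is reloaded without
an extending load. A minimal IR reproducer of this flavor (illustrative only,
not part of the patch; types mirror the fixed-subvector tests above):

  ; <8 x i8> is widened to <8 x i16> (ushll) and written into the promoted
  ; <vscale x 8 x i16> image of %vec on the stack.
  define <vscale x 8 x i8> @insert_fixed_example(<vscale x 8 x i8> %vec, <8 x i8> %subvec) {
    %ins = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8> %vec, <8 x i8> %subvec, i64 0)
    ret <vscale x 8 x i8> %ins
  }
  declare <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8>, <8 x i8>, i64)
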
@@ -215,18 +196,19 @@
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    sub x9, x9, #2 // =2
 ; CHECK-NEXT:    mov w8, #2
 ; CHECK-NEXT:    cmp x9, #2 // =2
 ; CHECK-NEXT:    csel x8, x9, x8, lo
-; CHECK-NEXT:    addpl x9, sp, #4
-; CHECK-NEXT:    lsl x8, x8, #2
-; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    str d1, [x9, x8]
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    lsl x8, x8, #3
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str q0, [x9, x8]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -236,25 +218,27 @@
   ret <vscale x 2 x i32> %ins
 }
 
-define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_large_i32(<vscale x 2 x i32>* %a, <8 x i32>* %b) #0 {
+define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_large_i32(<vscale x 2 x i32>* %a, <8 x i32>* %b) #1 {
 ; CHECK-LABEL: vec_scalable_subvec_fixed_idx_nonzero_large_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    subs x8, x8, #8 // =8
+; CHECK-NEXT:    ptrue p1.s, vl8
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ldp q1, q2, [x1]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    subs x8, x8, #8 // =8
 ; CHECK-NEXT:    csel x8, xzr, x8, lo
 ; CHECK-NEXT:    mov w9, #8
 ; CHECK-NEXT:    cmp x8, #8 // =8
+; CHECK-NEXT:    ptrue p1.d, vl8
 ; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    uunpklo z0.d, z1.s
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    add x8, x9, x8, lsl #2
-; CHECK-NEXT:    st1w { z0.d }, p0, [sp]
-; CHECK-NEXT:    stp q1, q2, [x8]
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp]
+; CHECK-NEXT:    st1d { z0.d }, p1, [x9, x8, lsl #3]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -274,3 +258,4 @@
 declare <vscale x 2 x i32> @llvm.experimental.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32>, <8 x i32>, i64)
 
 attributes #0 = { nounwind "target-features"="+sve" }
+attributes #1 = { nounwind "target-features"="+sve" vscale_range(4,4) }
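
NOTE: the large i32 test now uses attribute group #1, which adds
vscale_range(4,4). With vscale pinned to 4, <vscale x 2 x i32> holds exactly 8
elements, so the <8 x i32> subvector is known to fit in a single SVE register
and can be loaded and stored with fixed-length predicates (ptrue p1.s, vl8 and
ptrue p1.d, vl8) instead of the previous NEON ldp/stp register pairs.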