diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10958,16 +10958,15 @@ EVT InVT = Op.getOperand(1).getValueType(); unsigned Idx = cast(Op.getOperand(2))->getZExtValue(); - if (InVT.isScalableVector()) { - SDLoc DL(Op); - EVT VT = Op.getValueType(); + SDValue Vec0 = Op.getOperand(0); + SDValue Vec1 = Op.getOperand(1); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + if (InVT.isScalableVector()) { if (!isTypeLegal(VT)) return SDValue(); - SDValue Vec0 = Op.getOperand(0); - SDValue Vec1 = Op.getOperand(1); - // Ensure the subvector is half the size of the main vector. if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) return SDValue(); @@ -10997,6 +10996,16 @@ return SDValue(); } + if (Idx == 0 && isPackedVectorType(VT, DAG) && !Op.getOperand(0).isUndef()) { + unsigned int PredPattern = + getSVEPredPatternFromNumElements(InVT.getVectorNumElements()); + auto PredTy = VT.changeVectorElementType(MVT::i1); + SDValue PTrue = getPTrue(DAG, DL, PredTy, PredPattern); + SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1); + // Active (low) lanes take the subvector; the rest keep Vec0. + return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0); + } + // This will be matched by custom code during ISelDAGToDAG. if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef()) return Op; diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll --- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll +++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll @@ -74,17 +74,12 @@ define @vec_scalable_subvec_fixed_idx_zero_i8(* %a, <8 x i8>* %b) #0 { ; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: st1h { z0.h }, p0, [sp] -; CHECK-NEXT: str q1, [sp] -; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret %vec = load , * %a %subvec = load <8 x i8>, <8 x i8>* %b @@ -123,17 +118,12 @@ define @vec_scalable_subvec_fixed_idx_zero_i16(* %a, <4 x i16>* %b) #0 { ; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: st1w { z0.s }, p0, [sp] -; CHECK-NEXT: str q1, [sp] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret %vec = load , * %a %subvec = load <4 x i16>, <4 x i16>* %b @@ -172,18 +162,13 @@ define @vec_scalable_subvec_fixed_idx_zero_i32(* %a, <2 x i32>* %b) #0 { ; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: str q1, [sp] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: ret %vec = load , * %a %subvec = load <2 x i32>, <2 x i32>* %b %ins = call @llvm.experimental.vector.insert.nxv2i32.v2i32( %vec, <2 x i32> %subvec, i64 0) diff --git a/llvm/test/CodeGen/AArch64/split-vector-insert.ll b/llvm/test/CodeGen/AArch64/split-vector-insert.ll --- a/llvm/test/CodeGen/AArch64/split-vector-insert.ll +++ b/llvm/test/CodeGen/AArch64/split-vector-insert.ll @@ -16,42 +16,42 @@ ; CHECK-LABEL: test_nxv2i64_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-4 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: cntd x8 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sub x8, x8, #2 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: str q1, [sp] -; CHECK-NEXT: mov w9, #2 -; CHECK-NEXT: cmp x8, #2 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: cmp x8, #4 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: str q2, [x10, x9] -; CHECK-NEXT: mov w9, #4 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: addvl x10, sp, #2 -; CHECK-NEXT: cmp x8, #6 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: str q3, [x10, x9] -; CHECK-NEXT: mov w9, #6 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: addvl x9, sp, #3 -; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #3, mul vl] -; CHECK-NEXT: str q4, [x9, x8] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #3, mul vl] -; CHECK-NEXT: addvl sp, sp, #4 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: mov w9, #2 +; CHECK-NEXT: sub x8, x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: str q2, [x10, x9] +; CHECK-NEXT: mov w9, #4 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: addvl x10, sp, #1 +; CHECK-NEXT: cmp x8, #6 +; 
CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: str q3, [x10, x9] +; CHECK-NEXT: mov w9, #6 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: addvl x9, sp, #2 +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str q4, [x9, x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -67,42 +67,42 @@ ; CHECK-LABEL: test_nxv2f64_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-4 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: cntd x8 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sub x8, x8, #2 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: str q1, [sp] -; CHECK-NEXT: mov w9, #2 -; CHECK-NEXT: cmp x8, #2 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: cmp x8, #4 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: str q2, [x10, x9] -; CHECK-NEXT: mov w9, #4 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: addvl x10, sp, #2 -; CHECK-NEXT: cmp x8, #6 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: str q3, [x10, x9] -; CHECK-NEXT: mov w9, #6 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: addvl x9, sp, #3 -; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, 
[sp, #3, mul vl] -; CHECK-NEXT: str q4, [x9, x8] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #3, mul vl] -; CHECK-NEXT: addvl sp, sp, #4 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: mov w9, #2 +; CHECK-NEXT: sub x8, x8, #2 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: str q2, [x10, x9] +; CHECK-NEXT: mov w9, #4 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: addvl x10, sp, #1 +; CHECK-NEXT: cmp x8, #6 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: str q3, [x10, x9] +; CHECK-NEXT: mov w9, #6 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: addvl x9, sp, #2 +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str q4, [x9, x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll @@ -4,14 +4,9 @@ define @insert_v2i64_nxv2i64( %vec, <2 x i64> %subvec) nounwind { ; CHECK-LABEL: insert_v2i64_nxv2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: str q1, [sp] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov z0.d, p0/m, z1.d ; CHECK-NEXT: ret %retval = call @llvm.experimental.vector.insert.nxv2i64.v2i64( %vec, <2 x i64> %subvec, i64 0) ret %retval @@ -43,15 +38,10 @@ define @insert_v4i32_nxv4i32( %vec, <4 x i32> %subvec) nounwind { ; CHECK-LABEL: insert_v4i32_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z0.s }, p0, [sp] -; CHECK-NEXT: str q1, [sp] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov z0.s, p0/m, z1.s +; CHECK-NEXT: ret %retval = call @llvm.experimental.vector.insert.nxv4i32.v4i32( %vec, <4 x i32> %subvec, i64 0) ret %retval } @@ -82,14 +72,9 @@ define @insert_v8i16_nxv8i16( %vec, <8 x i16> %subvec) nounwind { ; CHECK-LABEL: insert_v8i16_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: st1h { z0.h }, p0, [sp] -; CHECK-NEXT: str q1, [sp] -; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret %retval = call @llvm.experimental.vector.insert.nxv8i16.v8i16( %vec, <8 x i16> %subvec, i64 0) ret %retval @@ -121,14 +106,9 @@ define @insert_v16i8_nxv16i8( %vec, <16 x i8> %subvec) nounwind { ; CHECK-LABEL: insert_v16i8_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: st1b { z0.b }, p0, [sp] -; CHECK-NEXT: str q1, [sp] -; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret %retval = call @llvm.experimental.vector.insert.nxv16i8.v16i8( %vec, <16 x i8> %subvec, i64 0) ret %retval @@ -513,15 +493,10 @@ define @insert_nxv8bf16_v8bf16( %sv0, <8 x bfloat> %v1) nounwind { ; CHECK-LABEL: insert_nxv8bf16_v8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: st1h { z0.h }, p0, [sp] -; CHECK-NEXT: str q1, [sp] -; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov z0.h, p0/m, z1.h +; CHECK-NEXT: ret %v0 = call @llvm.experimental.vector.insert.nxv8bf16.v8bf16( %sv0, <8 x bfloat> %v1, i64 0) ret %v0 }