diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10958,16 +10958,15 @@
   EVT InVT = Op.getOperand(1).getValueType();
   unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
 
-  if (InVT.isScalableVector()) {
-    SDLoc DL(Op);
-    EVT VT = Op.getValueType();
+  SDValue Vec0 = Op.getOperand(0);
+  SDValue Vec1 = Op.getOperand(1);
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
 
+  if (InVT.isScalableVector()) {
     if (!isTypeLegal(VT))
       return SDValue();
 
-    SDValue Vec0 = Op.getOperand(0);
-    SDValue Vec1 = Op.getOperand(1);
-
     // Ensure the subvector is half the size of the main vector.
     if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
       return SDValue();
@@ -10997,9 +10996,18 @@
     return SDValue();
   }
 
-  // This will be matched by custom code during ISelDAGToDAG.
-  if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
-    return Op;
+  if (Idx == 0 && isPackedVectorType(VT, DAG)) {
+    // This will be matched by custom code during ISelDAGToDAG.
+    if (Vec0.isUndef())
+      return Op;
+
+    unsigned int PredPattern =
+        getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
+    auto PredTy = VT.changeVectorElementType(MVT::i1);
+    SDValue PTrue = getPTrue(DAG, DL, PredTy, PredPattern);
+    SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
+    return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
+  }
 
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
--- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -74,17 +74,12 @@
 define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_zero_i8(<vscale x 8 x i8>* %a, <8 x i8>* %b) #0 {
 ; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    ptrue p0.h, vl8
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    str q1, [sp]
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
   %subvec = load <8 x i8>, <8 x i8>* %b
@@ -123,17 +118,12 @@
 define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_zero_i16(<vscale x 4 x i16>* %a, <4 x i16>* %b) #0 {
 ; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    str q1, [sp]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
   %subvec = load <4 x i16>, <4 x i16>* %b
@@ -172,17 +162,12 @@
 define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_zero_i32(<vscale x 2 x i32>* %a, <2 x i32>* %b) #0 {
 ; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    str q1, [sp]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 2 x i32>, <vscale x 2 x i32>* %a
   %subvec = load <2 x i32>, <2 x i32>* %b
diff --git a/llvm/test/CodeGen/AArch64/split-vector-insert.ll b/llvm/test/CodeGen/AArch64/split-vector-insert.ll
--- a/llvm/test/CodeGen/AArch64/split-vector-insert.ll
+++ b/llvm/test/CodeGen/AArch64/split-vector-insert.ll
@@ -17,44 +17,46 @@
 ; CHECK-LABEL: test_nxv2i64_v8i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-4
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    cntd x8
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sub x8, x8, #2
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    str q1, [sp]
 ; CHECK-NEXT:    mov w9, #2
+; CHECK-NEXT:    sub x8, x8, #2
+; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    cmp x8, #2
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    csel x9, x8, x9, lo
-; CHECK-NEXT:    addvl x10, sp, #1
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    lsl x9, x9, #3
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    cmp x8, #4
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    str q2, [x10, x9]
 ; CHECK-NEXT:    mov w9, #4
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT:    csel x9, x8, x9, lo
 ; CHECK-NEXT:    lsl x9, x9, #3
-; CHECK-NEXT:    addvl x10, sp, #2
+; CHECK-NEXT:    addvl x10, sp, #1
 ; CHECK-NEXT:    cmp x8, #6
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    str q3, [x10, x9]
 ; CHECK-NEXT:    mov w9, #6
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
-; CHECK-NEXT:    addvl x9, sp, #3
+; CHECK-NEXT:    addvl x9, sp, #2
 ; CHECK-NEXT:    lsl x8, x8, #3
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #2, mul vl]
 ; CHECK-NEXT:    str q4, [x9, x8]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #3, mul vl]
-; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #3
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+
+
   %r = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> %a, <8 x i64> %b, i64 0)
   ret <vscale x 2 x i64> %r
 }
@@ -68,44 +70,46 @@
 ; CHECK-LABEL: test_nxv2f64_v8f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-4
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    cntd x8
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sub x8, x8, #2
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    str q1, [sp]
 ; CHECK-NEXT:    mov w9, #2
+; CHECK-NEXT:    sub x8, x8, #2
+; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    cmp x8, #2
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    csel x9, x8, x9, lo
-; CHECK-NEXT:    addvl x10, sp, #1
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    lsl x9, x9, #3
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    cmp x8, #4
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    str q2, [x10, x9]
 ; CHECK-NEXT:    mov w9, #4
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT:    csel x9, x8, x9, lo
 ; CHECK-NEXT:    lsl x9, x9, #3
-; CHECK-NEXT:    addvl x10, sp, #2
+; CHECK-NEXT:    addvl x10, sp, #1
 ; CHECK-NEXT:    cmp x8, #6
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    str q3, [x10, x9]
 ; CHECK-NEXT:    mov w9, #6
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
-; CHECK-NEXT:    addvl x9, sp, #3
+; CHECK-NEXT:    addvl x9, sp, #2
 ; CHECK-NEXT:    lsl x8, x8, #3
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #2, mul vl]
 ; CHECK-NEXT:    str q4, [x9, x8]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #3, mul vl]
-; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #3
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+
+
   %r = call <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v8f64(<vscale x 2 x double> %a, <8 x double> %b, i64 0)
   ret <vscale x 2 x double> %r
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -4,14 +4,9 @@
 define <vscale x 2 x i64> @insert_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
 ; CHECK-LABEL: insert_v2i64_nxv2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    str q1, [sp]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
   %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 0)
   ret <vscale x 2 x i64> %retval
@@ -43,14 +38,9 @@
 define <vscale x 4 x i32> @insert_v4i32_nxv4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec) nounwind {
 ; CHECK-LABEL: insert_v4i32_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    str q1, [sp]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
   %retval = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 0)
   ret <vscale x 4 x i32> %retval
@@ -82,14 +72,9 @@
 define <vscale x 8 x i16> @insert_v8i16_nxv8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec) nounwind {
 ; CHECK-LABEL: insert_v8i16_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    str q1, [sp]
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
   %retval = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec, i64 0)
   ret <vscale x 8 x i16> %retval
@@ -121,14 +106,9 @@
 define <vscale x 16 x i8> @insert_v16i8_nxv16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec) nounwind {
 ; CHECK-LABEL: insert_v16i8_nxv16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    str q1, [sp]
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mov z0.b, p0/m, z1.b
 ; CHECK-NEXT:    ret
   %retval = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec, i64 0)
   ret <vscale x 16 x i8> %retval
@@ -469,7 +449,7 @@
 define <vscale x 2 x bfloat> @insert_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind {
 ; CHECK-LABEL: insert_nxv2bf16_nxv2bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %v0 = call <vscale x 2 x bfloat> @llvm.experimental.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 0)
   ret <vscale x 2 x bfloat> %v0
@@ -478,7 +458,7 @@
 define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind {
 ; CHECK-LABEL: insert_nxv4bf16_nxv4bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %v0 = call <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 0)
   ret <vscale x 4 x bfloat> %v0
@@ -487,15 +467,15 @@
 define <vscale x 4 x bfloat> @insert_nxv4bf16_v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1) nounwind {
 ; CHECK-LABEL: insert_nxv4bf16_v4bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addpl x8, sp, #4
-; CHECK-NEXT:    str d1, [x8]
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    str d1, [x8]
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %v0 = call <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1, i64 0)
   ret <vscale x 4 x bfloat> %v0
@@ -504,7 +484,7 @@
 define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1) nounwind {
 ; CHECK-LABEL: insert_nxv8bf16_nxv8bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %v0 = call <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1, i64 0)
   ret <vscale x 8 x bfloat> %v0
@@ -513,14 +493,9 @@
 define <vscale x 8 x bfloat> @insert_nxv8bf16_v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1) nounwind {
 ; CHECK-LABEL: insert_nxv8bf16_v8bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    str q1, [sp]
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
   %v0 = call <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1, i64 0)
   ret <vscale x 8 x bfloat> %v0