diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -11107,8 +11107,16 @@
   // VECTOR_SHUFFLE doesn't support a scalable mask so use a dedicated node.
   if (VT.isScalableVector()) {
     MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-    setValue(&I, DAG.getNode(ISD::VECTOR_SPLICE, DL, VT, V1, V2,
-                             DAG.getConstant(Imm, DL, IdxVT)));
+    // Reverse the second vector so the splice can be selected with the SVE
+    // SPLICE/INSR patterns; for Imm == 1 those patterns expect the reversed
+    // vector as the first operand, hence the operand swap below.
+    SDValue V2Rev = DAG.getNode(ISD::VECTOR_REVERSE, DL, VT, V2);
+    if (Imm == 1)
+      setValue(&I, DAG.getNode(ISD::VECTOR_SPLICE, DL, VT, V2Rev, V1,
+                               DAG.getConstant(Imm, DL, IdxVT)));
+    else
+      setValue(&I, DAG.getNode(ISD::VECTOR_SPLICE, DL, VT, V1, V2Rev,
+                               DAG.getConstant(Imm, DL, IdxVT)));
     return;
   }
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -946,6 +946,7 @@
   SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
                               bool OverrideNEON = false) const;
   SDValue LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1158,6 +1158,7 @@
       setOperationAction(ISD::MULHS, VT, Custom);
       setOperationAction(ISD::MULHU, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
       setOperationAction(ISD::SELECT, VT, Custom);
       setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::SDIV, VT, Custom);
@@ -1272,6 +1273,7 @@
       setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
       setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
       setOperationAction(ISD::SELECT_CC, VT, Expand);
     }
 
@@ -1538,6 +1540,7 @@
     setOperationAction(ISD::SMAX, VT, Custom);
     setOperationAction(ISD::SMIN, VT, Custom);
     setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+    setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
     setOperationAction(ISD::SRA, VT, Custom);
     setOperationAction(ISD::SRL, VT, Custom);
     setOperationAction(ISD::STORE, VT, Custom);
@@ -4886,6 +4889,8 @@
                                /*OverrideNEON=*/true);
   case ISD::CTTZ:
     return LowerCTTZ(Op, DAG);
+  case ISD::VECTOR_SPLICE:
+    return LowerVECTOR_SPLICE(Op, DAG);
   }
 }
 
@@ -7391,6 +7396,14 @@
   return CS1;
 }
 
+SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  // Nothing to rewrite here: returning the operation unchanged marks it as
+  // legal, and the actual selection is done by the vector_splice patterns in
+  // AArch64SVEInstrInfo.td.
+  return Op;
+}
+
 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
                                               SelectionDAG &DAG) const {
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1226,6 +1226,45 @@
   def : Pat<(nxv8bf16 (concat_vectors nxv4bf16:$v1, nxv4bf16:$v2)),
             (UZP1_ZZZ_H $v1, $v2)>;
 
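+  // Note: visitVectorSplice has already reversed the second input and, for
+  // an index of 1, swapped the operands, so the node matched here is
+  // vector_splice(rev(B), A, 1). As an illustrative sketch for nxv2i64 with
+  // A = (a0, a1) and B = (b0, b1):
+  //   lastb(ptrue, rev(B)) extracts the scalar b0, and
+  //   insr(A, b0) shifts A up and inserts b0 at lane 0.
+  // This avoids the WHILELE/SPLICE/REV sequence used for general indices.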
+  // Splice with lane equal to 1
+  def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 1))),
+            (INSR_ZV_B ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+            (LASTB_VPZ_B (PTRUE_B 31), ZPR:$Z1), bsub))>;
+  def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 1))),
+            (INSR_ZV_H ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+            (LASTB_VPZ_H (PTRUE_H 31), ZPR:$Z1), hsub))>;
+  def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 1))),
+            (INSR_ZV_S ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+            (LASTB_VPZ_S (PTRUE_S 31), ZPR:$Z1), ssub))>;
+  def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 1))),
+            (INSR_ZV_D ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+            (LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>;
+  def : Pat<(nxv8f16 (vector_splice (nxv8f16 ZPR:$Z1), (nxv8f16 ZPR:$Z2), (i64 1))),
+            (INSR_ZV_H ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+            (LASTB_VPZ_H (PTRUE_H 31), ZPR:$Z1), hsub))>;
+  def : Pat<(nxv4f16 (vector_splice (nxv4f16 ZPR:$Z1), (nxv4f16 ZPR:$Z2), (i64 1))),
+            (INSR_ZV_H ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+            (LASTB_VPZ_H (PTRUE_S 31), ZPR:$Z1), hsub))>;
+  def : Pat<(nxv2f16 (vector_splice (nxv2f16 ZPR:$Z1), (nxv2f16 ZPR:$Z2), (i64 1))),
+            (INSR_ZV_H ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+            (LASTB_VPZ_H (PTRUE_D 31), ZPR:$Z1), hsub))>;
+  def : Pat<(nxv4f32 (vector_splice (nxv4f32 ZPR:$Z1), (nxv4f32 ZPR:$Z2), (i64 1))),
+            (INSR_ZV_S ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+            (LASTB_VPZ_S (PTRUE_S 31), ZPR:$Z1), ssub))>;
+  def : Pat<(nxv2f32 (vector_splice (nxv2f32 ZPR:$Z1), (nxv2f32 ZPR:$Z2), (i64 1))),
+            (INSR_ZV_S ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+            (LASTB_VPZ_S (PTRUE_D 31), ZPR:$Z1), ssub))>;
+  def : Pat<(nxv2f64 (vector_splice (nxv2f64 ZPR:$Z1), (nxv2f64 ZPR:$Z2), (i64 1))),
+            (INSR_ZV_D ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+            (LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>;
+
   defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
   defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
   defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>;
@@ -2370,6 +2409,32 @@
   def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
             (f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
   }
+
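+  // For a general index the splice is built from a governing predicate:
+  // WHILELE xzr, $index activates the first $index+1 lanes, SPLICE joins
+  // the (already reversed) second operand with the first, and a trailing
+  // REV re-orders the result. Unpacked types use their container size.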
+  // Splice with generic index
+  def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), GPR64:$index)),
+            (REV_ZZ_B (SPLICE_ZPZ_B (WHILELE_PXX_B XZR, GPR64:$index), ZPR:$Z2, ZPR:$Z1))>;
+  def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), GPR64:$index)),
+            (REV_ZZ_H (SPLICE_ZPZ_H (WHILELE_PXX_H XZR, GPR64:$index), ZPR:$Z2, ZPR:$Z1))>;
+  def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), GPR64:$index)),
+            (REV_ZZ_S (SPLICE_ZPZ_S (WHILELE_PXX_S XZR, GPR64:$index), ZPR:$Z2, ZPR:$Z1))>;
+  def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), GPR64:$index)),
+            (REV_ZZ_D (SPLICE_ZPZ_D (WHILELE_PXX_D XZR, GPR64:$index), ZPR:$Z2, ZPR:$Z1))>;
+  def : Pat<(nxv8f16 (vector_splice (nxv8f16 ZPR:$Z1), (nxv8f16 ZPR:$Z2), GPR64:$index)),
+            (REV_ZZ_H (SPLICE_ZPZ_H (WHILELE_PXX_H XZR, GPR64:$index), ZPR:$Z2, ZPR:$Z1))>;
+  def : Pat<(nxv4f16 (vector_splice (nxv4f16 ZPR:$Z1), (nxv4f16 ZPR:$Z2), GPR64:$index)),
+            (REV_ZZ_S (SPLICE_ZPZ_S (WHILELE_PXX_S XZR, GPR64:$index), ZPR:$Z2, ZPR:$Z1))>;
+  def : Pat<(nxv2f16 (vector_splice (nxv2f16 ZPR:$Z1), (nxv2f16 ZPR:$Z2), GPR64:$index)),
+            (REV_ZZ_D (SPLICE_ZPZ_D (WHILELE_PXX_D XZR, GPR64:$index), ZPR:$Z2, ZPR:$Z1))>;
+  def : Pat<(nxv4f32 (vector_splice (nxv4f32 ZPR:$Z1), (nxv4f32 ZPR:$Z2), GPR64:$index)),
+            (REV_ZZ_S (SPLICE_ZPZ_S (WHILELE_PXX_S XZR, GPR64:$index), ZPR:$Z2, ZPR:$Z1))>;
+  def : Pat<(nxv2f32 (vector_splice (nxv2f32 ZPR:$Z1), (nxv2f32 ZPR:$Z2), GPR64:$index)),
+            (REV_ZZ_D (SPLICE_ZPZ_D (WHILELE_PXX_D XZR, GPR64:$index), ZPR:$Z2, ZPR:$Z1))>;
+  def : Pat<(nxv2f64 (vector_splice (nxv2f64 ZPR:$Z1), (nxv2f64 ZPR:$Z2), GPR64:$index)),
+            (REV_ZZ_D (SPLICE_ZPZ_D (WHILELE_PXX_D XZR, GPR64:$index), ZPR:$Z2, ZPR:$Z1))>;
 }
 
 let Predicates = [HasSVE, HasMatMulInt8] in {
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -10,15 +10,10 @@
 define <vscale x 16 x i8> @splice_nxv16i8_first_idx(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i8_first_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    rev z1.b, z1.b
+; CHECK-NEXT:    whilele p0.b, xzr, xzr
+; CHECK-NEXT:    splice z1.b, p0, z1.b, z0.b
+; CHECK-NEXT:    rev z0.b, z1.b
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
   ret <vscale x 16 x i8> %res
@@ -27,16 +22,11 @@
 define <vscale x 16 x i8> @splice_nxv16i8_last_idx(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i8_last_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    orr x8, x8, #0xf
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov w8, #15
+; CHECK-NEXT:    rev z1.b, z1.b
+; CHECK-NEXT:    whilele p0.b, xzr, x8
+; CHECK-NEXT:    splice z1.b, p0, z1.b, z0.b
+; CHECK-NEXT:    rev z0.b, z1.b
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 15)
   ret <vscale x 16 x i8> %res
@@ -46,20 +36,11 @@
 define <vscale x 16 x i8> @splice_nxv16i8_clamped_idx(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i8_clamped_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    sub x9, x9, #1 // =1
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w10, #16
-; CHECK-NEXT:    cmp x9, #16 // =16
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x9, x10, lo
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    rev z1.b, z1.b
+; CHECK-NEXT:    whilele p0.b, xzr, x8
+; CHECK-NEXT:    splice z1.b, p0, z1.b, z0.b
+; CHECK-NEXT:    rev z0.b, z1.b
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 16)
   ret <vscale x 16 x i8> %res
@@ -68,15 +49,10 @@
 define <vscale x 8 x i16> @splice_nxv8i16_first_idx(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i16_first_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    rev z1.h, z1.h
+; CHECK-NEXT:    whilele p0.h, xzr, xzr
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 0)
   ret <vscale x 8 x i16> %res
@@ -85,16 +61,11 @@
 define <vscale x 8 x i16> @splice_nxv8i16_last_idx(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i16_last_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    orr x8, x8, #0xe
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov w8, #7
+; CHECK-NEXT:    rev z1.h, z1.h
+; CHECK-NEXT:    whilele p0.h, xzr, x8
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 7)
   ret <vscale x 8 x i16> %res
@@ -104,20 +75,11 @@
 define <vscale x 8 x i16> @splice_nxv8i16_clamped_idx(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i16_clamped_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    cnth x10
-; CHECK-NEXT:    sub x10, x10, #1 // =1
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w9, #8
-; CHECK-NEXT:    cmp x10, #8 // =8
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x10, x9, lo
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    rev z1.h, z1.h
+; CHECK-NEXT:    whilele p0.h, xzr, x8
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 8)
   ret <vscale x 8 x i16> %res
@@ -126,15 +88,10 @@
 define <vscale x 4 x i32> @splice_nxv4i32_first_idx(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i32_first_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    rev z1.s, z1.s
+; CHECK-NEXT:    whilele p0.s, xzr, xzr
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 0)
   ret <vscale x 4 x i32> %res
@@ -143,16 +100,11 @@
 define <vscale x 4 x i32> @splice_nxv4i32_last_idx(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i32_last_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    orr x8, x8, #0xc
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov w8, #3
+; CHECK-NEXT:    rev z1.s, z1.s
+; CHECK-NEXT:    whilele p0.s, xzr, x8
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 3)
   ret <vscale x 4 x i32> %res
@@ -162,20 +114,11 @@
 define <vscale x 4 x i32> @splice_nxv4i32_clamped_idx(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i32_clamped_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    cntw x10
-; CHECK-NEXT:    sub x10, x10, #1 // =1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w9, #4
-; CHECK-NEXT:    cmp x10, #4 // =4
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x10, x9, lo
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    rev z1.s, z1.s
+; CHECK-NEXT:    whilele p0.s, xzr, x8
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 4)
   ret <vscale x 4 x i32> %res
@@ -184,15 +127,10 @@
 define <vscale x 2 x i64> @splice_nxv2i64_first_idx(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i64_first_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    rev z1.d, z1.d
+; CHECK-NEXT:    whilele p0.d, xzr, xzr
+; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT:    rev z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 0)
   ret <vscale x 2 x i64> %res
@@ -201,16 +139,10 @@
 define <vscale x 2 x i64> @splice_nxv2i64_last_idx(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i64_last_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rev z1.d, z1.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    orr x8, x8, #0x8
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    lastb d1, p0, z1.d
+; CHECK-NEXT:    insr z0.d, d1
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 1)
   ret <vscale x 2 x i64> %res
@@ -220,20 +152,11 @@
 define <vscale x 2 x i64> @splice_nxv2i64_clamped_idx(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i64_clamped_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    cntd x10
-; CHECK-NEXT:    sub x10, x10, #1 // =1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w9, #2
-; CHECK-NEXT:    cmp x10, #2 // =2
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x10, x9, lo
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    rev z1.d, z1.d
+; CHECK-NEXT:    whilele p0.d, xzr, x8
+; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT:    rev z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 2)
   ret <vscale x 2 x i64> %res
@@ -242,15 +165,10 @@
 define <vscale x 8 x half> @splice_nxv8f16_first_idx(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv8f16_first_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    rev z1.h, z1.h
+; CHECK-NEXT:    whilele p0.h, xzr, xzr
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 0)
   ret <vscale x 8 x half> %res
@@ -259,16 +177,11 @@
 define <vscale x 8 x half> @splice_nxv8f16_last_idx(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv8f16_last_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    orr x8, x8, #0xe
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov w8, #7
+; CHECK-NEXT:    rev z1.h, z1.h
+; CHECK-NEXT:    whilele p0.h, xzr, x8
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 7)
   ret <vscale x 8 x half> %res
@@ -278,20 +191,11 @@
 define <vscale x 8 x half> @splice_nxv8f16_clamped_idx(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv8f16_clamped_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    cnth x10
-; CHECK-NEXT:    sub x10, x10, #1 // =1
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w9, #8
-; CHECK-NEXT:    cmp x10, #8 // =8
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x10, x9, lo
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    rev z1.h, z1.h
+; CHECK-NEXT:    whilele p0.h, xzr, x8
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 8)
   ret <vscale x 8 x half> %res
@@ -300,15 +204,10 @@
 define <vscale x 4 x float> @splice_nxv4f32_first_idx(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f32_first_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    rev z1.s, z1.s
+; CHECK-NEXT:    whilele p0.s, xzr, xzr
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 0)
   ret <vscale x 4 x float> %res
@@ -317,16 +216,11 @@
 define <vscale x 4 x float> @splice_nxv4f32_last_idx(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f32_last_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    orr x8, x8, #0xc
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov w8, #3
+; CHECK-NEXT:    rev z1.s, z1.s
+; CHECK-NEXT:    whilele p0.s, xzr, x8
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 3)
   ret <vscale x 4 x float> %res
@@ -336,20 +230,11 @@
 define <vscale x 4 x float> @splice_nxv4f32_clamped_idx(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f32_clamped_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    cntw x10
-; CHECK-NEXT:    sub x10, x10, #1 // =1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w9, #4
-; CHECK-NEXT:    cmp x10, #4 // =4
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x10, x9, lo
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    rev z1.s, z1.s
+; CHECK-NEXT:    whilele p0.s, xzr, x8
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 4)
   ret <vscale x 4 x float> %res
@@ -358,15 +243,10 @@
 define <vscale x 2 x double> @splice_nxv2f64_first_idx(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f64_first_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    rev z1.d, z1.d
+; CHECK-NEXT:    whilele p0.d, xzr, xzr
+; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT:    rev z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 0)
   ret <vscale x 2 x double> %res
@@ -375,16 +255,10 @@
 define <vscale x 2 x double> @splice_nxv2f64_last_idx(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f64_last_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rev z1.d, z1.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    orr x8, x8, #0x8
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    lastb d1, p0, z1.d
+; CHECK-NEXT:    insr z0.d, d1
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 1)
   ret <vscale x 2 x double> %res
@@ -394,20 +268,11 @@
 define <vscale x 2 x double> @splice_nxv2f64_clamped_idx(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f64_clamped_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    cntd x10
-; CHECK-NEXT:    sub x10, x10, #1 // =1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w9, #2
-; CHECK-NEXT:    cmp x10, #2 // =2
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x10, x9, lo
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    rev z1.d, z1.d
+; CHECK-NEXT:    whilele p0.d, xzr, x8
+; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT:    rev z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 2)
   ret <vscale x 2 x double> %res
@@ -417,20 +282,14 @@
 define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i1_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z1.d, p1/z, #1 // =0x1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    orr x8, x8, #0x8
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    and z0.d, z0.d, #0x1
-; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    rev p1.d, p1.d
+; CHECK-NEXT:    ptrue p2.d
+; CHECK-NEXT:    mov z0.d, p1/z, #1 // =0x1
+; CHECK-NEXT:    lastb d0, p2, z0.d
+; CHECK-NEXT:    mov z1.d, p0/z, #1 // =0x1
+; CHECK-NEXT:    insr z1.d, d0
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    cmpne p0.d, p2/z, z1.d, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 1)
   ret <vscale x 2 x i1> %res
@@ -440,20 +299,16 @@
 define <vscale x 4 x i1> @splice_nxv4i1_idx(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i1_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z1.s, p1/z, #1 // =0x1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    orr x8, x8, #0x8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    rev p0.s, p1.s
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    whilele p0.s, xzr, x8
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    and z0.s, z0.s, #0x1
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 2)
   ret <vscale x 4 x i1> %res
@@ -463,20 +318,16 @@
 define <vscale x 8 x i1> @splice_nxv8i1_idx(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i1_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    mov z0.h, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z1.h, p1/z, #1 // =0x1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    orr x8, x8, #0x8
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    rev p0.h, p1.h
+; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
+; CHECK-NEXT:    whilele p0.h, xzr, x8
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    and z0.h, z0.h, #0x1
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 4)
   ret <vscale x 8 x i1> %res
@@ -486,20 +337,16 @@
 define <vscale x 16 x i1> @splice_nxv16i1_idx(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i1_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z1.b, p1/z, #1 // =0x1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    orr x8, x8, #0x8
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    rev p0.b, p1.b
+; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    whilele p0.b, xzr, x8
+; CHECK-NEXT:    splice z1.b, p0, z1.b, z0.b
+; CHECK-NEXT:    rev z0.b, z1.b
 ; CHECK-NEXT:    and z0.b, z0.b, #0x1
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 8)
   ret <vscale x 16 x i1> %res
@@ -509,16 +356,10 @@
 define <vscale x 2 x i8> @splice_nxv2i8_idx(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i8_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    rev z1.d, z1.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    orr x8, x8, #0x8
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    lastb d1, p0, z1.d
+; CHECK-NEXT:    insr z0.d, d1
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 1)
   ret <vscale x 2 x i8> %res
@@ -532,10 +373,12 @@
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    rev z2.s, z2.s
+; CHECK-NEXT:    rev z3.s, z3.s
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #3, mul vl]
-; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #2, mul vl]
 ; CHECK-NEXT:    orr x8, x8, #0x8
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8, #1, mul vl]
@@ -556,16 +399,20 @@
 ; CHECK-NEXT:    sub x10, x10, #1 // =1
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    rev z6.s, z6.s
+; CHECK-NEXT:    rev z5.s, z5.s
+; CHECK-NEXT:    rev z4.s, z4.s
+; CHECK-NEXT:    rev z7.s, z7.s
 ; CHECK-NEXT:    mov w9, #16
 ; CHECK-NEXT:    cmp x10, #16 // =16
 ; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #3, mul vl]
 ; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #2, mul vl]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z7.s }, p0, [x8, #7, mul vl]
-; CHECK-NEXT:    st1w { z4.s }, p0, [x8, #4, mul vl]
-; CHECK-NEXT:    st1w { z5.s }, p0, [x8, #5, mul vl]
-; CHECK-NEXT:    st1w { z6.s }, p0, [x8, #6, mul vl]
+; CHECK-NEXT:    st1w { z6.s }, p0, [x8, #5, mul vl]
+; CHECK-NEXT:    st1w { z5.s }, p0, [x8, #6, mul vl]
+; CHECK-NEXT:    st1w { z4.s }, p0, [x8, #7, mul vl]
+; CHECK-NEXT:    st1w { z7.s }, p0, [x8, #4, mul vl]
 ; CHECK-NEXT:    csel x9, x10, x9, lo
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT:    add x8, x8, x9, lsl #2
@@ -586,17 +433,11 @@
 define <vscale x 16 x i8> @splice_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-16
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-16
+; CHECK-NEXT:    rev z1.b, z1.b
+; CHECK-NEXT:    whilele p0.b, xzr, x8
+; CHECK-NEXT:    splice z1.b, p0, z1.b, z0.b
+; CHECK-NEXT:    rev z0.b, z1.b
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -16)
   ret <vscale x 16 x i8> %res
@@ -605,17 +446,11 @@
 define <vscale x 16 x i8> @splice_nxv16i8_1(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i8_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-1
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    rev z1.b, z1.b
+; CHECK-NEXT:    whilele p0.b, xzr, x8
+; CHECK-NEXT:    splice z1.b, p0, z1.b, z0.b
+; CHECK-NEXT:    rev z0.b, z1.b
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
   ret <vscale x 16 x i8> %res
@@ -625,21 +460,11 @@
 define <vscale x 16 x i8> @splice_nxv16i8_clamped(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i8_clamped:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w10, #17
-; CHECK-NEXT:    cmp x9, #17 // =17
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x9, x10, lo
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-17
+; CHECK-NEXT:    rev z1.b, z1.b
+; CHECK-NEXT:    whilele p0.b, xzr, x8
+; CHECK-NEXT:    splice z1.b, p0, z1.b, z0.b
+; CHECK-NEXT:    rev z0.b, z1.b
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -17)
   ret <vscale x 16 x i8> %res
@@ -648,18 +473,11 @@
 define <vscale x 8 x i16> @splice_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-16
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-8
+; CHECK-NEXT:    rev z1.h, z1.h
+; CHECK-NEXT:    whilele p0.h, xzr, x8
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -8)
   ret <vscale x 8 x i16> %res
@@ -668,18 +486,11 @@
 define <vscale x 8 x i16> @splice_nxv8i16_1(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i16_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-2
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    rev z1.h, z1.h
+; CHECK-NEXT:    whilele p0.h, xzr, x8
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
   ret <vscale x 8 x i16> %res
@@ -689,21 +500,11 @@
 define <vscale x 8 x i16> @splice_nxv8i16_clamped(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i16_clamped:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w10, #18
-; CHECK-NEXT:    cmp x9, #18 // =18
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x9, x10, lo
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-9
+; CHECK-NEXT:    rev z1.h, z1.h
+; CHECK-NEXT:    whilele p0.h, xzr, x8
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -9)
   ret <vscale x 8 x i16> %res
@@ -712,18 +513,11 @@
 define <vscale x 4 x i32> @splice_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-16
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-4
+; CHECK-NEXT:    rev z1.s, z1.s
+; CHECK-NEXT:    whilele p0.s, xzr, x8
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -4)
   ret <vscale x 4 x i32> %res
@@ -732,18 +526,11 @@
 define <vscale x 4 x i32> @splice_nxv4i32_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i32_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-4
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    rev z1.s, z1.s
+; CHECK-NEXT:    whilele p0.s, xzr, x8
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
   ret <vscale x 4 x i32> %res
@@ -753,21 +540,11 @@
 define <vscale x 4 x i32> @splice_nxv4i32_clamped(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i32_clamped:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w10, #20
-; CHECK-NEXT:    cmp x9, #20 // =20
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x9, x10, lo
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-5
+; CHECK-NEXT:    rev z1.s, z1.s
+; CHECK-NEXT:    whilele p0.s, xzr, x8
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -5)
   ret <vscale x 4 x i32> %res
@@ -776,18 +553,11 @@
 define <vscale x 2 x i64> @splice_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-16
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-2
+; CHECK-NEXT:    rev z1.d, z1.d
+; CHECK-NEXT:    whilele p0.d, xzr, x8
+; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT:    rev z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -2)
   ret <vscale x 2 x i64> %res
@@ -796,18 +566,11 @@
 define <vscale x 2 x i64> @splice_nxv2i64_1(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i64_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-8
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    rev z1.d, z1.d
+; CHECK-NEXT:    whilele p0.d, xzr, x8
+; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT:    rev z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
   ret <vscale x 2 x i64> %res
@@ -817,21 +580,11 @@
 define <vscale x 2 x i64> @splice_nxv2i64_clamped(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i64_clamped:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w10, #24
-; CHECK-NEXT:    cmp x9, #24 // =24
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x9, x10, lo
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-3
+; CHECK-NEXT:    rev z1.d, z1.d
+; CHECK-NEXT:    whilele p0.d, xzr, x8
+; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT:    rev z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -3)
   ret <vscale x 2 x i64> %res
@@ -840,18 +593,11 @@
 define <vscale x 8 x half> @splice_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-16
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-8
+; CHECK-NEXT:    rev z1.h, z1.h
+; CHECK-NEXT:    whilele p0.h, xzr, x8
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -8)
   ret <vscale x 8 x half> %res
@@ -860,18 +606,11 @@
 define <vscale x 8 x half> @splice_nxv8f16_1(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv8f16_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-2
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    rev z1.h, z1.h
+; CHECK-NEXT:    whilele p0.h, xzr, x8
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
   ret <vscale x 8 x half> %res
@@ -881,21 +620,11 @@
 define <vscale x 8 x half> @splice_nxv8f16_clamped(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv8f16_clamped:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w10, #18
-; CHECK-NEXT:    cmp x9, #18 // =18
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x9, x10, lo
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-9
+; CHECK-NEXT:    rev z1.h, z1.h
+; CHECK-NEXT:    whilele p0.h, xzr, x8
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -9)
   ret <vscale x 8 x half> %res
@@ -904,18 +633,11 @@
 define <vscale x 4 x float> @splice_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-16
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-4
+; CHECK-NEXT:    rev z1.s, z1.s
+; CHECK-NEXT:    whilele p0.s, xzr, x8
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -4)
   ret <vscale x 4 x float> %res
@@ -924,18 +646,11 @@
 define <vscale x 4 x float> @splice_nxv4f32_1(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f32_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-4
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    rev z1.s, z1.s
+; CHECK-NEXT:    whilele p0.s, xzr, x8
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
   ret <vscale x 4 x float> %res
@@ -945,21 +660,11 @@
 define <vscale x 4 x float> @splice_nxv4f32_clamped(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f32_clamped:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w10, #20
-; CHECK-NEXT:    cmp x9, #20 // =20
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x9, x10, lo
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-5
+; CHECK-NEXT:    rev z1.s, z1.s
+; CHECK-NEXT:    whilele p0.s, xzr, x8
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -5)
   ret <vscale x 4 x float> %res
@@ -968,18 +673,11 @@
 define <vscale x 2 x double> @splice_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-16
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-2
+; CHECK-NEXT:    rev z1.d, z1.d
+; CHECK-NEXT:    whilele p0.d, xzr, x8
+; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT:    rev z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -2)
   ret <vscale x 2 x double> %res
@@ -988,18 +686,11 @@
 define <vscale x 2 x double> @splice_nxv2f64_1(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f64_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-8
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    rev z1.d, z1.d
+; CHECK-NEXT:    whilele p0.d, xzr, x8
+; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT:    rev z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
   ret <vscale x 2 x double> %res
@@ -1009,21 +700,11 @@
 define <vscale x 2 x double> @splice_nxv2f64_clamped(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f64_clamped:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w10, #24
-; CHECK-NEXT:    cmp x9, #24 // =24
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    csel x9, x9, x10, lo
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-3
+; CHECK-NEXT:    rev z1.d, z1.d
+; CHECK-NEXT:    whilele p0.d, xzr, x8
+; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT:    rev z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
   ret <vscale x 2 x double> %res
@@ -1033,22 +714,16 @@
 define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z1.d, p1/z, #1 // =0x1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-8
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
+; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    rev p0.d, p1.d
+; CHECK-NEXT:    whilele p1.d, xzr, x8
+; CHECK-NEXT:    mov z1.d, p0/z, #1 // =0x1
+; CHECK-NEXT:    splice z1.d, p1, z1.d, z0.d
+; CHECK-NEXT:    rev z0.d, z1.d
 ; CHECK-NEXT:    and z0.d, z0.d, #0x1
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
   ret <vscale x 2 x i1> %res
@@ -1058,22 +733,16 @@
 define <vscale x 4 x i1> @splice_nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z1.s, p1/z, #1 // =0x1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-4
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
+; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    rev p0.s, p1.s
+; CHECK-NEXT:    whilele p1.s, xzr, x8
+; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    splice z1.s, p1, z1.s, z0.s
+; CHECK-NEXT:    rev z0.s, z1.s
 ; CHECK-NEXT:    and z0.s, z0.s, #0x1
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
   ret <vscale x 4 x i1> %res
@@ -1083,22 +752,16 @@
 define <vscale x 8 x i1> @splice_nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    mov z0.h, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z1.h, p1/z, #1 // =0x1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-2
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
+; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    rev p0.h, p1.h
+; CHECK-NEXT:    whilele p1.h, xzr, x8
+; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
+; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT:    rev z0.h, z1.h
 ; CHECK-NEXT:    and z0.h, z0.h, #0x1
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
   ret <vscale x 8 x i1> %res
@@ -1108,21 +771,16 @@
 define <vscale x 16 x i1> @splice_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z1.b, p1/z, #1 // =0x1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-1
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, x9]
+; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    rev p0.b, p1.b
+; CHECK-NEXT:    whilele p1.b, xzr, x8
+; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    splice z1.b, p1, z1.b, z0.b
+; CHECK-NEXT:    rev z0.b, z1.b
 ; CHECK-NEXT:    and z0.b, z0.b, #0x1
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
   ret <vscale x 16 x i1> %res
@@ -1132,18 +790,11 @@
 define <vscale x 2 x i8> @splice_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-16
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    mov x8, #-2
+; CHECK-NEXT:    rev z1.d, z1.d
+; CHECK-NEXT:    whilele p0.d, xzr, x8
+; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
+; CHECK-NEXT:    rev z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -2)
   ret <vscale x 2 x i8> %res
@@ -1157,12 +808,14 @@
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    rev z2.s, z2.s
+; CHECK-NEXT:    rev z3.s, z3.s
 ; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    mov x9, #-32
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #3, mul vl]
-; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #2, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #2
 ; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
 ; CHECK-NEXT:    sub x8, x8, #32 // =32
@@ -1183,16 +836,20 @@
 ; CHECK-NEXT:    rdvl x9, #4
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    rev z6.s, z6.s
+; CHECK-NEXT:    rev z5.s, z5.s
+; CHECK-NEXT:    rev z4.s, z4.s
+; CHECK-NEXT:    rev z7.s, z7.s
 ; CHECK-NEXT:    mov w10, #68
 ; CHECK-NEXT:    cmp x9, #68 // =68
 ; CHECK-NEXT:    st1w { z3.s }, p0, [x8, #3, mul vl]
 ; CHECK-NEXT:    st1w { z2.s }, p0, [x8, #2, mul vl]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z7.s }, p0, [x8, #7, mul vl]
-; CHECK-NEXT:    st1w { z4.s }, p0, [x8, #4, mul vl]
-; CHECK-NEXT:    st1w { z5.s }, p0, [x8, #5, mul vl]
-; CHECK-NEXT:    st1w { z6.s }, p0, [x8, #6, mul vl]
+; CHECK-NEXT:    st1w { z6.s }, p0, [x8, #5, mul vl]
+; CHECK-NEXT:    st1w { z5.s }, p0, [x8, #6, mul vl]
+; CHECK-NEXT:    st1w { z4.s }, p0, [x8, #7, mul vl]
+; CHECK-NEXT:    st1w { z7.s }, p0, [x8, #4, mul vl]
 ; CHECK-NEXT:    addvl x8, x8, #4
 ; CHECK-NEXT:    csel x9, x9, x10, lo
 ; CHECK-NEXT:    sub x8, x8, x9