diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -948,6 +948,7 @@
   SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
                               bool OverrideNEON = false) const;
   SDValue LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -895,6 +895,7 @@
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::VECTOR_SPLICE);
   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::CONCAT_VECTORS);
@@ -1159,6 +1160,7 @@
     setOperationAction(ISD::MULHS, VT, Custom);
     setOperationAction(ISD::MULHU, VT, Custom);
     setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+    setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
     setOperationAction(ISD::SELECT, VT, Custom);
     setOperationAction(ISD::SETCC, VT, Custom);
     setOperationAction(ISD::SDIV, VT, Custom);
@@ -1282,6 +1284,7 @@
     setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
     setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
     setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+    setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
     setOperationAction(ISD::SELECT_CC, VT, Expand);
   }
@@ -1559,6 +1562,7 @@
     setOperationAction(ISD::SMAX, VT, Custom);
     setOperationAction(ISD::SMIN, VT, Custom);
     setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+    setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
     setOperationAction(ISD::SRA, VT, Custom);
     setOperationAction(ISD::SRL, VT, Custom);
     setOperationAction(ISD::STORE, VT, Custom);
@@ -4911,6 +4915,8 @@
                                /*OverrideNEON=*/true);
   case ISD::CTTZ:
     return LowerCTTZ(Op, DAG);
+  case ISD::VECTOR_SPLICE:
+    return LowerVECTOR_SPLICE(Op, DAG);
   }
 }
@@ -7417,6 +7423,14 @@
   return CS1;
 }
 
+SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+
+  if (Op.getConstantOperandAPInt(2) == -1)
+    return Op;
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
                                               SelectionDAG &DAG) const {
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
@@ -16487,6 +16501,28 @@
   return performPostLD1Combine(N, DCI, true);
 }
 
+SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT Ty = N->getValueType(0);
+  if (Ty.isInteger())
+    return SDValue();
+
+  EVT IntTy = Ty.changeVectorElementTypeToInteger();
+  EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
+  if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
+      IntTy.getVectorElementType().getScalarSizeInBits())
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
+                                     DL, ExtIntTy);
+  SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
+                                     DL, ExtIntTy);
+  SDValue Idx = N->getOperand(2);
+  SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
+  SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
+  return DAG.getBitcast(Ty, Trunc);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -16539,6 +16575,8 @@
     break;
   case ISD::STORE:
     return performSTORECombine(N, DCI, DAG, Subtarget);
+  case ISD::VECTOR_SPLICE:
+    return performSVESpliceCombine(N, DAG);
   case AArch64ISD::BRCOND:
     return performBRCONDCombine(N, DCI, DAG);
   case AArch64ISD::TBNZ:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1226,6 +1226,20 @@
   def : Pat<(nxv8bf16 (concat_vectors nxv4bf16:$v1, nxv4bf16:$v2)),
             (UZP1_ZZZ_H $v1, $v2)>;
 
+  // Splice with lane equal to -1
+  def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 -1))),
+            (INSR_ZV_B ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+                                (LASTB_VPZ_B (PTRUE_B 31), ZPR:$Z1), bsub))>;
+  def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 -1))),
+            (INSR_ZV_H ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+                                (LASTB_VPZ_H (PTRUE_H 31), ZPR:$Z1), hsub))>;
+  def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 -1))),
+            (INSR_ZV_S ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+                                (LASTB_VPZ_S (PTRUE_S 31), ZPR:$Z1), ssub))>;
+  def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 -1))),
+            (INSR_ZV_D ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
+                                (LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>;
+
   defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
   defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
   defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>;
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -239,6 +239,70 @@
   ret %res
 }
 
+define <vscale x 2 x half> @splice_nxv2f16_neg_idx(<vscale x 2 x half> %a, <vscale x 2 x half> %b) #0 {
+; CHECK-LABEL: splice_nxv2f16_neg_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    insr z1.d, d0
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
+  ret <vscale x 2 x half> %res
+}
+
+define <vscale x 2 x half> @splice_nxv2f16_neg2_idx(<vscale x 2 x half> %a, <vscale x 2 x half> %b) #0 {
+; CHECK-LABEL: splice_nxv2f16_neg2_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    mov x9, #-16
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -2)
+  ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x half> @splice_nxv4f16_neg_idx(<vscale x 4 x half> %a, <vscale x 4 x half> %b) #0 {
+; CHECK-LABEL: splice_nxv4f16_neg_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    insr z1.s, s0
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
+  ret <vscale x 4 x half> %res
+}
+
+define <vscale x 4 x half> @splice_nxv4f16_neg3_idx(<vscale x 4 x half> %a, <vscale x 4 x half> %b) #0 {
+; CHECK-LABEL: splice_nxv4f16_neg3_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    mov x9, #-12
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -3)
+  ret <vscale x 4 x half> %res
+}
+
 define <vscale x 8 x half> @splice_nxv8f16_first_idx(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv8f16_first_idx:
 ; CHECK:       // %bb.0:
@@ -297,6 +361,38 @@
   ret %res
 }
 
+define <vscale x 2 x float> @splice_nxv2f32_neg_idx(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
+; CHECK-LABEL: splice_nxv2f32_neg_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    insr z1.d, d0
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
+  ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x float> @splice_nxv2f32_neg2_idx(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
+; CHECK-LABEL: splice_nxv2f32_neg2_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    mov x9, #-16
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -2)
+  ret <vscale x 2 x float> %res
+}
+
 define <vscale x 4 x float> @splice_nxv4f32_first_idx(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f32_first_idx:
 ; CHECK:       // %bb.0:
@@ -605,17 +701,10 @@
 define <vscale x 16 x i8> @splice_nxv16i8_1(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i8_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-1
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    lastb b0, p0, z0.b
+; CHECK-NEXT:    insr z1.b, b0
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
   ret <vscale x 16 x i8> %res
@@ -668,18 +757,10 @@
 define <vscale x 8 x i16> @splice_nxv8i16_1(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i16_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-2
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    lastb h0, p0, z0.h
+; CHECK-NEXT:    insr z1.h, h0
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
   ret <vscale x 8 x i16> %res
@@ -732,18 +813,10 @@
 define <vscale x 4 x i32> @splice_nxv4i32_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i32_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-4
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    insr z1.s, s0
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
   ret <vscale x 4 x i32> %res
@@ -796,18 +869,10 @@
 define <vscale x 2 x i64> @splice_nxv2i64_1(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i64_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-8
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    insr z1.d, d0
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
   ret <vscale x 2 x i64> %res
@@ -860,18 +925,10 @@
 define <vscale x 8 x half> @splice_nxv8f16_1(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv8f16_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-2
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    lastb h0, p0, z0.h
+; CHECK-NEXT:    insr z1.h, h0
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
   ret <vscale x 8 x half> %res
@@ -924,18 +981,10 @@
 define <vscale x 4 x float> @splice_nxv4f32_1(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f32_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-4
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    insr z1.s, s0
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
   ret <vscale x 4 x float> %res
@@ -988,18 +1037,10 @@
 define <vscale x 2 x double> @splice_nxv2f64_1(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f64_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-8
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    insr z1.d, d0
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
   ret <vscale x 2 x double> %res
@@ -1033,22 +1074,13 @@
 define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    lastb d0, p0, z0.d
 ; CHECK-NEXT:    mov z1.d, p1/z, #1 // =0x1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-8
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    and z0.d, z0.d, #0x1
-; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    insr z1.d, d0
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
   ret <vscale x 2 x i1> %res
@@ -1058,22 +1090,13 @@
 define <vscale x 4 x i1> @splice_nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lastb s0, p0, z0.s
 ; CHECK-NEXT:    mov z1.s, p1/z, #1 // =0x1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-4
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    and z0.s, z0.s, #0x1
-; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    insr z1.s, s0
+; CHECK-NEXT:    and z1.s, z1.s, #0x1
+; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
   ret <vscale x 4 x i1> %res
@@ -1083,22 +1106,13 @@
 define <vscale x 8 x i1> @splice_nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    mov z0.h, p0/z, #1 // =0x1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    lastb h0, p0, z0.h
 ; CHECK-NEXT:    mov z1.h, p1/z, #1 // =0x1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-2
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x8, x9]
-; CHECK-NEXT:    and z0.h, z0.h, #0x1
-; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    insr z1.h, h0
+; CHECK-NEXT:    and z1.h, z1.h, #0x1
+; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
   ret <vscale x 8 x i1> %res
@@ -1108,21 +1122,13 @@
 define <vscale x 16 x i1> @splice_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    lastb b0, p0, z0.b
 ; CHECK-NEXT:    mov z1.b, p1/z, #1 // =0x1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    mov x9, #-1
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, x9]
-; CHECK-NEXT:    and z0.b, z0.b, #0x1
-; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    insr z1.b, b0
+; CHECK-NEXT:    and z1.b, z1.b, #0x1
+; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
   ret <vscale x 16 x i1> %res
@@ -1217,7 +1223,10 @@
 declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
 declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
 declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
+declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
 declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
 declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
 declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
 declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
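
For reference, the case this patch targets is a vector splice with an index of -1, i.e. the last element of the first operand followed by all but the last element of the second. A minimal standalone reproducer is sketched below; the function name and the llc invocation in the comment are illustrative only and are not part of the patch. With the new patterns this selects the ptrue/lastb/insr sequence instead of the stack round-trip shown in the removed CHECK lines above.

; Sketch only. Compile with something like: llc -mtriple=aarch64 -mattr=+sve
define <vscale x 4 x i32> @splice_last_element_example(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
  ; Index -1: take the last element of %a, then the first (4 x vscale - 1) elements of %b.
  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
  ret <vscale x 4 x i32> %res
}

declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)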