diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7796,10 +7796,37 @@
                                                   SelectionDAG &DAG) const {
   EVT Ty = Op.getValueType();
   auto Idx = Op.getConstantOperandAPInt(2);
+  int64_t IdxVal = Idx.getSExtValue();
+  assert(Ty.isScalableVector() &&
+         "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
+
+  // We can use the splice instruction for certain index values where we are
+  // able to efficiently generate the correct predicate. The index will be
+  // inverted and used directly as the input to the ptrue instruction, i.e.
+  // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
+  // splice predicate. However, we can only do this if we can guarantee that
+  // there are enough elements in the vector, hence we check the index <= min
+  // number of elements.
+  Optional<unsigned> PredPattern;
+  if (Ty.isScalableVector() && IdxVal < 0 &&
+      (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
+          None) {
+    SDLoc DL(Op);
+
+    // Create a predicate where all but the last -IdxVal elements are false.
+    EVT PredVT = Ty.changeVectorElementType(MVT::i1);
+    SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
+    Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
+
+    // Now splice the two inputs together using the predicate.
+    return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
+                       Op.getOperand(1));
+  }
 
   // This will select to an EXT instruction, which has a maximum immediate
   // value of 255, hence 2048-bits is the maximum value we can lower.
-  if (Idx.sge(-1) && Idx.slt(2048 / Ty.getVectorElementType().getSizeInBits()))
+  if (IdxVal >= 0 &&
+      IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
     return Op;
 
   return SDValue();
@@ -11011,10 +11038,10 @@
   if (Vec0.isUndef())
     return Op;
 
-  unsigned int PredPattern =
+  Optional<unsigned> PredPattern =
       getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
   auto PredTy = VT.changeVectorElementType(MVT::i1);
-  SDValue PTrue = getPTrue(DAG, DL, PredTy, PredPattern);
+  SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
   SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
   return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
 }
@@ -12319,7 +12346,7 @@
 
   Value *PTrue = nullptr;
   if (UseScalable) {
-    unsigned PgPattern =
+    Optional<unsigned> PgPattern =
         getSVEPredPatternFromNumElements(FVTy->getNumElements());
     if (Subtarget->getMinSVEVectorSizeInBits() ==
             Subtarget->getMaxSVEVectorSizeInBits() &&
@@ -12327,7 +12354,7 @@
       PgPattern = AArch64SVEPredPattern::all;
 
     auto *PTruePat =
-        ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), PgPattern);
+        ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
     PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
                                     {PTruePat});
   }
@@ -12499,7 +12526,7 @@
 
   Value *PTrue = nullptr;
   if (UseScalable) {
-    unsigned PgPattern =
+    Optional<unsigned> PgPattern =
        getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
     if (Subtarget->getMinSVEVectorSizeInBits() ==
             Subtarget->getMaxSVEVectorSizeInBits() &&
@@ -12508,7 +12535,7 @@
       PgPattern = AArch64SVEPredPattern::all;
 
     auto *PTruePat =
-        ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), PgPattern);
+        ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
     PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
                                     {PTruePat});
   }
@@ -18752,7 +18779,7 @@
          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
          "Expected legal fixed length vector!");
 
-  unsigned PgPattern =
+  Optional<unsigned> PgPattern =
       getSVEPredPatternFromNumElements(VT.getVectorNumElements());
   assert(PgPattern && "Unexpected element count for SVE predicate");
 
@@ -18788,7 +18815,7 @@
     break;
   }
 
-  return getPTrue(DAG, DL, MaskVT, PgPattern);
+  return getPTrue(DAG, DL, MaskVT, *PgPattern);
 }
 
 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
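To make the lowering above concrete: for a negative index the code asks ptrue for a predicate covering the first -Idx lanes, reverses it so the active lanes sit at the top of the vector, and hands that to the SVE SPLICE instruction. Below is a minimal standalone model of that data flow in plain C++; it is illustrative only, and none of these helpers (spliceMask, vectorSplice) are LLVM API.

#include <algorithm>
#include <cstdint>
#include <vector>

// Model of the mask the lowering builds: ptrue vlN activates the first N
// lanes, and rev flips it so the *last* N lanes of operand 0 are active.
std::vector<bool> spliceMask(size_t NumElts, int64_t Idx) {
  std::vector<bool> Mask(NumElts, false);
  std::fill(Mask.begin(), Mask.begin() + size_t(-Idx), true); // ptrue pN, vlN
  std::reverse(Mask.begin(), Mask.end());                     // rev pN, pN
  return Mask;
}

// Model of SPLICE for a trailing-lane mask: the active lanes of Op0 followed
// by the leading lanes of Op1. Assumes Idx < 0 and -Idx <= Op0.size().
std::vector<int> vectorSplice(const std::vector<int> &Op0,
                              const std::vector<int> &Op1, int64_t Idx) {
  std::vector<bool> Mask = spliceMask(Op0.size(), Idx);
  std::vector<int> Res;
  for (size_t I = 0; I != Op0.size(); ++I)
    if (Mask[I])
      Res.push_back(Op0[I]); // the last -Idx elements of the first input
  for (size_t I = 0; Res.size() != Op0.size(); ++I)
    Res.push_back(Op1[I]);   // topped up from the start of the second input
  return Res;
}

With eight lanes and Idx == -2, spliceMask activates the last two lanes, so the result is the last two elements of Op0 followed by the first six of Op1 -- the same vector.splice semantics the EXT-based path in the hunk above implements for non-negative indices.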
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -483,18 +483,20 @@
 }
 
 /// Return specific VL predicate pattern based on the number of elements.
-inline unsigned getSVEPredPatternFromNumElements(unsigned MinNumElts) {
+inline Optional<unsigned>
+getSVEPredPatternFromNumElements(unsigned MinNumElts) {
   switch (MinNumElts) {
   default:
-    llvm_unreachable("unexpected element count for SVE predicate");
+    return None;
   case 1:
-    return AArch64SVEPredPattern::vl1;
   case 2:
-    return AArch64SVEPredPattern::vl2;
+  case 3:
   case 4:
-    return AArch64SVEPredPattern::vl4;
+  case 5:
+  case 6:
+  case 7:
   case 8:
-    return AArch64SVEPredPattern::vl8;
+    return MinNumElts;
   case 16:
     return AArch64SVEPredPattern::vl16;
   case 32:
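The switch above folds vl1 through vl8 into a single `return MinNumElts;`. That works because the SVE PTRUE pattern encoding assigns the VL1..VL8 patterns the literal values 1..8. A compile-time sketch of that identity follows; the enum mirrors AArch64SVEPredPattern rather than including the real header, so treat it as an illustration, not the LLVM definition.

// Standalone mirror of the SVE predicate-constraint encoding used by PTRUE.
enum SVEPredPattern : unsigned {
  pow2 = 0,
  vl1 = 1, vl2 = 2, vl3 = 3, vl4 = 4,
  vl5 = 5, vl6 = 6, vl7 = 7, vl8 = 8,
  vl16 = 9, vl32 = 10, vl64 = 11, vl128 = 12, vl256 = 13,
  mul4 = 29, mul3 = 30, all = 31
};

// vl1..vl8 encode their own lane count, so for 1 <= MinNumElts <= 8 the
// function can return MinNumElts unchanged. Counts with no pattern (9-15,
// or anything above 8 that is not one of 16/32/64/128/256) hit the default
// case, which is why the return type becomes Optional<unsigned>.
static_assert(vl1 == 1 && vl5 == 5 && vl8 == 8,
              "vlN patterns 1-8 encode their lane count directly");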
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -81,10 +81,9 @@
 define <vscale x 2 x half> @splice_nxv2f16_neg_idx(<vscale x 2 x half> %a, <vscale x 2 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f16_neg_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    lastb d0, p0, z0.d
-; CHECK-NEXT:    insr z1.d, d0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
   ret <vscale x 2 x half> %res
@@ -93,17 +92,9 @@
 define <vscale x 2 x half> @splice_nxv2f16_neg2_idx(<vscale x 2 x half> %a, <vscale x 2 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f16_neg2_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x9, #-8
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -2)
   ret <vscale x 2 x half> %res
@@ -130,10 +121,9 @@
 define <vscale x 4 x half> @splice_nxv4f16_neg_idx(<vscale x 4 x half> %a, <vscale x 4 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f16_neg_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    lastb s0, p0, z0.s
-; CHECK-NEXT:    insr z1.s, s0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    rev p0.s, p0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
   ret <vscale x 4 x half> %res
@@ -142,17 +132,9 @@
 define <vscale x 4 x half> @splice_nxv4f16_neg3_idx(<vscale x 4 x half> %a, <vscale x 4 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f16_neg3_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x9, #-6
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.s, vl3
+; CHECK-NEXT:    rev p0.s, p0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -3)
   ret <vscale x 4 x half> %res
@@ -197,10 +179,9 @@
 define <vscale x 2 x float> @splice_nxv2f32_neg_idx(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f32_neg_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    lastb d0, p0, z0.d
-; CHECK-NEXT:    insr z1.d, d0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
   ret <vscale x 2 x float> %res
@@ -209,17 +190,9 @@
 define <vscale x 2 x float> @splice_nxv2f32_neg2_idx(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f32_neg2_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x9, #-4
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -2)
   ret <vscale x 2 x float> %res
@@ -411,29 +384,64 @@
 define <vscale x 16 x i8> @splice_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x9, #-16
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, x9]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    rev p0.b, p0.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -16)
   ret <vscale x 16 x i8> %res
 }
 
+define <vscale x 16 x i8> @splice_nxv16i8_neg32(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #2 {
+; CHECK-LABEL: splice_nxv16i8_neg32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    rev p0.b, p0.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @splice_nxv16i8_neg64(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #3 {
+; CHECK-LABEL: splice_nxv16i8_neg64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    rev p0.b, p0.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -64)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @splice_nxv16i8_neg128(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #4 {
+; CHECK-LABEL: splice_nxv16i8_neg128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    rev p0.b, p0.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -128)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @splice_nxv16i8_neg256(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #1 {
+; CHECK-LABEL: splice_nxv16i8_neg256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    rev p0.b, p0.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -256)
+  ret <vscale x 16 x i8> %res
+}
+
 define <vscale x 16 x i8> @splice_nxv16i8_1(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i8_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    lastb b0, p0, z0.b
-; CHECK-NEXT:    insr z1.b, b0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.b, vl1
+; CHECK-NEXT:    rev p0.b, p0.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
   ret <vscale x 16 x i8> %res
@@ -465,17 +473,9 @@
 define <vscale x 8 x i16> @splice_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x9, #-8
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    rev p0.h, p0.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -8)
   ret <vscale x 8 x i16> %res
@@ -484,10 +484,9 @@
 define <vscale x 8 x i16> @splice_nxv8i16_1(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i16_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    lastb h0, p0, z0.h
-; CHECK-NEXT:    insr z1.h, h0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    rev p0.h, p0.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
   ret <vscale x 8 x i16> %res
@@ -519,17 +518,9 @@
 define <vscale x 4 x i32> @splice_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x9, #-4
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    rev p0.s, p0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -4)
   ret <vscale x 4 x i32> %res
@@ -538,10 +529,9 @@
 define <vscale x 4 x i32> @splice_nxv4i32_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i32_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    lastb s0, p0, z0.s
-; CHECK-NEXT:    insr z1.s, s0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    rev p0.s, p0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
   ret <vscale x 4 x i32> %res
@@ -550,21 +540,9 @@
 define <vscale x 4 x i32> @splice_nxv4i32_neg5(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #2 {
 ; CHECK-LABEL: splice_nxv4i32_neg5:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    cmp x9, #20
-; CHECK-NEXT:    mov w10, #20
-; CHECK-NEXT:    csel x9, x9, x10, lo
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.s, vl5
+; CHECK-NEXT:    rev p0.s, p0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -5)
   ret <vscale x 4 x i32> %res
@@ -573,17 +551,9 @@
 define <vscale x 2 x i64> @splice_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x9, #-2
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -2)
   ret <vscale x 2 x i64> %res
@@ -592,10 +562,9 @@
 define <vscale x 2 x i64> @splice_nxv2i64_1(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i64_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    lastb d0, p0, z0.d
-; CHECK-NEXT:    insr z1.d, d0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
   ret <vscale x 2 x i64> %res
@@ -604,21 +573,9 @@
 define <vscale x 2 x i64> @splice_nxv2i64_neg3(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #2 {
 ; CHECK-LABEL: splice_nxv2i64_neg3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    cmp x9, #24
-; CHECK-NEXT:    mov w10, #24
-; CHECK-NEXT:    csel x9, x9, x10, lo
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.d, vl3
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -3)
   ret <vscale x 2 x i64> %res
@@ -627,17 +584,9 @@
 define <vscale x 8 x half> @splice_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x9, #-8
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    rev p0.h, p0.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -8)
   ret <vscale x 8 x half> %res
@@ -646,10 +595,9 @@
 define <vscale x 8 x half> @splice_nxv8f16_1(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv8f16_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    lastb h0, p0, z0.h
-; CHECK-NEXT:    insr z1.h, h0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    rev p0.h, p0.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
   ret <vscale x 8 x half> %res
@@ -681,17 +629,9 @@
 define <vscale x 4 x float> @splice_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x9, #-4
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    rev p0.s, p0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -4)
   ret <vscale x 4 x float> %res
@@ -700,10 +640,9 @@
 define <vscale x 4 x float> @splice_nxv4f32_1(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f32_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    lastb s0, p0, z0.s
-; CHECK-NEXT:    insr z1.s, s0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    rev p0.s, p0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
   ret <vscale x 4 x float> %res
@@ -712,21 +651,9 @@
 define <vscale x 4 x float> @splice_nxv4f32_neg5(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #2 {
 ; CHECK-LABEL: splice_nxv4f32_neg5:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    cmp x9, #20
-; CHECK-NEXT:    mov w10, #20
-; CHECK-NEXT:    csel x9, x9, x10, lo
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.s, vl5
+; CHECK-NEXT:    rev p0.s, p0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -5)
   ret <vscale x 4 x float> %res
@@ -735,17 +662,9 @@
 define <vscale x 2 x double> @splice_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x9, #-2
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -2)
   ret <vscale x 2 x double> %res
@@ -754,10 +673,9 @@
 define <vscale x 2 x double> @splice_nxv2f64_1(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f64_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    lastb d0, p0, z0.d
-; CHECK-NEXT:    insr z1.d, d0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
   ret <vscale x 2 x double> %res
@@ -766,21 +684,9 @@
 define <vscale x 2 x double> @splice_nxv2f64_neg3(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #2 {
 ; CHECK-LABEL: splice_nxv2f64_neg3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    cmp x9, #24
-; CHECK-NEXT:    mov w10, #24
-; CHECK-NEXT:    csel x9, x9, x10, lo
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.d, vl3
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
   ret <vscale x 2 x double> %res
@@ -790,13 +696,14 @@
 define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p2.d
-; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT:    lastb d0, p2, z0.d
-; CHECK-NEXT:    mov z1.d, p1/z, #1 // =0x1
-; CHECK-NEXT:    insr z1.d, d0
+; CHECK-NEXT:    ptrue p2.d, vl1
+; CHECK-NEXT:    mov z0.d, p1/z, #1 // =0x1
+; CHECK-NEXT:    rev p2.d, p2.d
+; CHECK-NEXT:    mov z1.d, p0/z, #1 // =0x1
+; CHECK-NEXT:    splice z1.d, p2, z1.d, z0.d
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    and z1.d, z1.d, #0x1
-; CHECK-NEXT:    cmpne p0.d, p2/z, z1.d, #0
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
   ret <vscale x 2 x i1> %res
@@ -806,13 +713,14 @@
 define <vscale x 4 x i1> @splice_nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p2.s
-; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
-; CHECK-NEXT:    lastb s0, p2, z0.s
-; CHECK-NEXT:    mov z1.s, p1/z, #1 // =0x1
-; CHECK-NEXT:    insr z1.s, s0
+; CHECK-NEXT:    ptrue p2.s, vl1
+; CHECK-NEXT:    mov z0.s, p1/z, #1 // =0x1
+; CHECK-NEXT:    rev p2.s, p2.s
+; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    splice z1.s, p2, z1.s, z0.s
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    and z1.s, z1.s, #0x1
-; CHECK-NEXT:    cmpne p0.s, p2/z, z1.s, #0
+; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
   ret <vscale x 4 x i1> %res
@@ -822,13 +730,14 @@
 define <vscale x 8 x i1> @splice_nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p2.h
-; CHECK-NEXT:    mov z0.h, p0/z, #1 // =0x1
-; CHECK-NEXT:    lastb h0, p2, z0.h
-; CHECK-NEXT:    mov z1.h, p1/z, #1 // =0x1
-; CHECK-NEXT:    insr z1.h, h0
+; CHECK-NEXT:    ptrue p2.h, vl1
+; CHECK-NEXT:    mov z0.h, p1/z, #1 // =0x1
+; CHECK-NEXT:    rev p2.h, p2.h
+; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
+; CHECK-NEXT:    splice z1.h, p2, z1.h, z0.h
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    and z1.h, z1.h, #0x1
-; CHECK-NEXT:    cmpne p0.h, p2/z, z1.h, #0
+; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
   ret <vscale x 8 x i1> %res
@@ -838,13 +747,14 @@
 define <vscale x 16 x i1> @splice_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p2.b
-; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
-; CHECK-NEXT:    lastb b0, p2, z0.b
-; CHECK-NEXT:    mov z1.b, p1/z, #1 // =0x1
-; CHECK-NEXT:    insr z1.b, b0
+; CHECK-NEXT:    ptrue p2.b, vl1
+; CHECK-NEXT:    mov z0.b, p1/z, #1 // =0x1
+; CHECK-NEXT:    rev p2.b, p2.b
+; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    splice z1.b, p2, z1.b, z0.b
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    and z1.b, z1.b, #0x1
-; CHECK-NEXT:    cmpne p0.b, p2/z, z1.b, #0
+; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
   ret <vscale x 16 x i1> %res
@@ -854,17 +764,9 @@
 define <vscale x 2 x i8> @splice_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x9, #-2
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
-; CHECK-NEXT:    addvl sp, sp, #2
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -2)
   ret <vscale x 2 x i8> %res
@@ -948,3 +850,5 @@
 attributes #0 = { nounwind "target-features"="+sve" }
 attributes #1 = { nounwind "target-features"="+sve" vscale_range(16,16) }
 attributes #2 = { nounwind "target-features"="+sve" vscale_range(2,16) }
+attributes #3 = { nounwind "target-features"="+sve" vscale_range(4,16) }
+attributes #4 = { nounwind "target-features"="+sve" vscale_range(8,16) }
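The new #3 and #4 attributes exist because a ptrue with a fixed vlN pattern only produces the intended predicate when the runtime vector length actually provides N lanes: vscale_range pins the minimum vscale, and a <vscale x 16 x i8> vector carries 16 lanes per 128 bits of vector length. A small sketch of that arithmetic (guaranteedLanes is a hypothetical helper for illustration, not LLVM API):

#include <cassert>

// Minimum lane count implied by a function's vscale_range attribute, for an
// element type with LanesPer128 lanes in each 128-bit granule (16 for i8).
unsigned guaranteedLanes(unsigned MinVScale, unsigned LanesPer128) {
  return MinVScale * LanesPer128;
}

int main() {
  assert(guaranteedLanes(2, 16) >= 32);   // #2 vscale_range(2,16)  -> vl32
  assert(guaranteedLanes(4, 16) >= 64);   // #3 vscale_range(4,16)  -> vl64
  assert(guaranteedLanes(8, 16) >= 128);  // #4 vscale_range(8,16)  -> vl128
  assert(guaranteedLanes(16, 16) >= 256); // #1 vscale_range(16,16) -> vl256
  return 0;
}

This matches the pairing of each new splice_nxv16i8_neg* test with an attribute whose minimum vector length covers the magnitude of its negative index.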