Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7785,9 +7785,33 @@ EVT Ty = Op.getValueType(); auto Idx = Op.getConstantOperandAPInt(2); + int64_t IdxVal = Idx.getSExtValue(); + + // We can use the splice instruction for certain index values where we are + // able to efficiently generate the correct predicate. The index will be + // negated and used directly as the input to the ptrue instruction, i.e. + // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the + // splice predicate. However, we can only do this if we can guarantee that + // there are enough elements in the vector, hence we check -index <= min + // number of elements. + if (Ty.isScalableVector() && IdxVal >= -8 && IdxVal <= -1 && + -IdxVal <= Ty.getVectorMinNumElements()) { + SDLoc DL(Op); + + // Create a predicate where all but the last -IdxVal elements are false. + EVT PredVT = Ty.changeVectorElementType(MVT::i1); + SDValue Pred = getPTrue(DAG, DL, PredVT, -IdxVal); + Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred); + + // Now splice the two inputs together using the predicate. + return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0), + Op.getOperand(1)); + } + // This will select to an EXT instruction, which has a maximum immediate // value of 255, hence 2048-bits is the maximum value we can lower. 
- if (Idx.sge(-1) && Idx.slt(2048 / Ty.getVectorElementType().getSizeInBits())) + if (IdxVal >= 0 && + IdxVal < (2048 / Ty.getVectorElementType().getSizeInBits())) return Op; return SDValue(); Index: llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll =================================================================== --- llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll +++ llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll @@ -182,10 +182,9 @@ define @splice_nxv2f16_neg_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f16_neg_idx: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: lastb d0, p0, z0.d -; CHECK-NEXT: insr z1.d, d0 -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: rev p0.d, p0.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f16( %a, %b, i32 -1) ret %res @@ -194,17 +193,9 @@ define @splice_nxv2f16_neg2_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f16_neg2_idx: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #-8 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: st1h { z0.h }, p0, [sp] -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1] -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: rev p0.d, p0.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f16( %a, %b, i32 -2) ret %res @@ -256,10 +247,9 @@ define @splice_nxv4f16_neg_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4f16_neg_idx: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: lastb s0, p0, z0.s -; CHECK-NEXT: insr z1.s, s0 -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.s, vl1 +; CHECK-NEXT: rev p0.s, p0.s +; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4f16( %a, %b, i32 -1) ret %res @@ -268,17 +258,9 @@ define @splice_nxv4f16_neg3_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4f16_neg3_idx: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #-6 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: st1h { z0.h }, p0, [sp] -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1] -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.s, vl3 +; CHECK-NEXT: rev p0.s, p0.s +; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4f16( %a, %b, i32 -3) ret %res @@ -371,10 +353,9 @@ define @splice_nxv2f32_neg_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f32_neg_idx: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: lastb d0, p0, z0.d -; CHECK-NEXT: insr z1.d, d0 -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: rev p0.d, p0.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f32( %a, %b, i32 -1) ret %res @@ -383,17 +364,9 @@ define @splice_nxv2f32_neg2_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f32_neg2_idx: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #-4 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z0.s }, p0, [sp] -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2] -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: rev p0.d, p0.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f32( %a, %b, i32 -2) ret %res @@ -675,10 +648,9 @@ define @splice_nxv16i8_1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv16i8_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: lastb b0, p0, z0.b -; CHECK-NEXT: insr z1.b, b0 -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.b, vl1 +; CHECK-NEXT: rev p0.b, p0.b +; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i8( %a, %b, i32 -1) ret %res @@ -711,17 +683,9 @@ define @splice_nxv8i16( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #-8 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: st1h { z0.h }, p0, [sp] -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1] -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: rev p0.h, p0.h +; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i16( %a, %b, i32 -8) ret %res @@ -730,10 +694,9 @@ define @splice_nxv8i16_1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8i16_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: lastb h0, p0, z0.h -; CHECK-NEXT: insr z1.h, h0 -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.h, vl1 +; CHECK-NEXT: rev p0.h, p0.h +; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i16( %a, %b, i32 -1) ret %res @@ -766,17 +729,9 @@ define @splice_nxv4i32( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #-4 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z0.s }, p0, [sp] -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2] -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: rev p0.s, p0.s +; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i32( %a, %b, i32 -4) ret %res @@ -785,10 +740,9 @@ define @splice_nxv4i32_1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4i32_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: lastb s0, p0, z0.s -; CHECK-NEXT: insr z1.s, s0 -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.s, vl1 +; CHECK-NEXT: rev p0.s, p0.s +; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i32( %a, %b, i32 -1) ret %res @@ -821,17 +775,9 @@ define @splice_nxv2i64( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #-2 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3] -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: rev p0.d, p0.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i64( %a, %b, i32 -2) ret %res @@ -840,10 +786,9 @@ define @splice_nxv2i64_1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2i64_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: lastb d0, p0, z0.d -; CHECK-NEXT: insr z1.d, d0 -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: rev p0.d, p0.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i64( %a, %b, i32 -1) ret %res @@ -876,17 +821,9 @@ define @splice_nxv8f16( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #-8 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: st1h { z0.h }, p0, [sp] -; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1] -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: rev p0.h, p0.h +; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8f16( %a, %b, i32 -8) ret %res @@ -895,10 +832,9 @@ define @splice_nxv8f16_1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8f16_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: lastb h0, p0, z0.h -; CHECK-NEXT: insr z1.h, h0 -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.h, vl1 +; CHECK-NEXT: rev p0.h, p0.h +; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8f16( %a, %b, i32 -1) ret %res @@ -931,17 +867,9 @@ define @splice_nxv4f32( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #-4 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z0.s }, p0, [sp] -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2] -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: rev p0.s, p0.s +; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4f32( %a, %b, i32 -4) ret %res @@ -950,10 +878,9 @@ define @splice_nxv4f32_1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4f32_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: lastb s0, p0, z0.s -; CHECK-NEXT: insr z1.s, s0 -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.s, vl1 +; CHECK-NEXT: rev p0.s, p0.s +; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4f32( %a, %b, i32 -1) ret %res @@ -986,17 +913,9 @@ define @splice_nxv2f64( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #-2 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3] -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: rev p0.d, p0.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f64( %a, %b, i32 -2) ret %res @@ -1005,10 +924,9 @@ define @splice_nxv2f64_1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f64_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: lastb d0, p0, z0.d -; CHECK-NEXT: insr z1.d, d0 -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: rev p0.d, p0.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f64( %a, %b, i32 -1) ret %res @@ -1042,13 +960,14 @@ define @splice_nxv2i1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p2.d -; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: lastb d0, p2, z0.d -; CHECK-NEXT: mov z1.d, p1/z, #1 // =0x1 -; CHECK-NEXT: insr z1.d, d0 +; CHECK-NEXT: ptrue p2.d, vl1 +; CHECK-NEXT: mov z0.d, p1/z, #1 // =0x1 +; CHECK-NEXT: rev p2.d, p2.d +; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1 +; CHECK-NEXT: splice z1.d, p2, z1.d, z0.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0x1 -; CHECK-NEXT: cmpne p0.d, p2/z, z1.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i1( %a, %b, i32 -1) ret %res @@ -1058,13 +977,14 @@ define @splice_nxv4i1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p2.s -; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: lastb s0, p2, z0.s -; CHECK-NEXT: mov z1.s, p1/z, #1 // =0x1 -; 
CHECK-NEXT: insr z1.s, s0 +; CHECK-NEXT: ptrue p2.s, vl1 +; CHECK-NEXT: mov z0.s, p1/z, #1 // =0x1 +; CHECK-NEXT: rev p2.s, p2.s +; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 +; CHECK-NEXT: splice z1.s, p2, z1.s, z0.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z1.s, z1.s, #0x1 -; CHECK-NEXT: cmpne p0.s, p2/z, z1.s, #0 +; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i1( %a, %b, i32 -1) ret %res @@ -1074,13 +994,14 @@ define @splice_nxv8i1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p2.h -; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 -; CHECK-NEXT: lastb h0, p2, z0.h -; CHECK-NEXT: mov z1.h, p1/z, #1 // =0x1 -; CHECK-NEXT: insr z1.h, h0 +; CHECK-NEXT: ptrue p2.h, vl1 +; CHECK-NEXT: mov z0.h, p1/z, #1 // =0x1 +; CHECK-NEXT: rev p2.h, p2.h +; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 +; CHECK-NEXT: splice z1.h, p2, z1.h, z0.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z1.h, z1.h, #0x1 -; CHECK-NEXT: cmpne p0.h, p2/z, z1.h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i1( %a, %b, i32 -1) ret %res @@ -1090,13 +1011,14 @@ define @splice_nxv16i1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p2.b -; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 -; CHECK-NEXT: lastb b0, p2, z0.b -; CHECK-NEXT: mov z1.b, p1/z, #1 // =0x1 -; CHECK-NEXT: insr z1.b, b0 +; CHECK-NEXT: ptrue p2.b, vl1 +; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1 +; CHECK-NEXT: rev p2.b, p2.b +; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 +; CHECK-NEXT: splice z1.b, p2, z1.b, z0.b +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: and z1.b, z1.b, #0x1 -; CHECK-NEXT: cmpne p0.b, p2/z, z1.b, #0 +; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i1( %a, %b, i32 -1) ret %res @@ -1106,17 +1028,9 @@ define @splice_nxv2i8( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2i8: ; CHECK: // %bb.0: -; 
CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #-2 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3] -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: rev p0.d, p0.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i8( %a, %b, i32 -2) ret %res