Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -25748,6 +25748,69 @@ } } +static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, + ArrayRef ShuffleMask, EVT VT, + EVT ContainerVT, SelectionDAG &DAG) { + auto &Subtarget = DAG.getSubtarget(); + SDLoc DL(Op); + unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits(); + unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits(); + bool IsSingleOp = ShuffleVectorInst::isSingleSourceMask(ShuffleMask); + + // Ignore two operands if no SVE2 or all index numbers couldn't + // be represented. + if (!IsSingleOp && (!Subtarget.hasSVE2() || MinSVESize != MaxSVESize)) + return SDValue(); + + EVT VTOp1 = Op.getOperand(0).getValueType(); + unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits(); + unsigned IndexLen = MinSVESize / BitsPerElt; + unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements(); + unsigned MaskSize = ShuffleMask.size(); + uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue(); + assert(ElementsPerVectorReg <= IndexLen && MaskSize <= IndexLen && + "Incorrectly legalised shuffle operation"); + + SmallVector TBLMask; + for (int Offset : ShuffleMask) { + // If we refer to the second operand then we have to add elements + // number in hardware register minus number of elements in a type. + if (Offset >= (int) ElementsPerVectorReg) + Offset += (int)IndexLen - (int)ElementsPerVectorReg; + // For 8-bit elements and 1024-bit SVE registers and MaxOffset equals + // to 255, this might point to the last element of in the second operand + // of the shufflevector, thus we are rejecting this transform. 
+ if (Offset < 0 || (unsigned) Offset >= MaxOffset) + return SDValue(); + TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i64)); + } + + // It is still better to fill TBL mask to the actual hardware supported + // size with out of index elements. + for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) + TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64)); + + EVT MaskEltType = EVT::getIntegerVT(*DAG.getContext(), BitsPerElt); + EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen); + EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType); + SDValue VecMask = + DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen)); + SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask); + + SDValue Shuffle; + if (IsSingleOp) { + Shuffle = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, + DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32), + Op1, SVEMask); + } else if (Subtarget.hasSVE2()) { + Shuffle = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, + DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), + Op1, Op2, SVEMask); + } + Shuffle = convertFromScalableVector(DAG, VT, Shuffle); + return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); +} + SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); @@ -25893,6 +25956,12 @@ } } + // Avoid producing TBL instruction if we don't know + // SVE register minimal size. 
+ if (MinSVESize) + return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT, + DAG); + return SDValue(); } Index: llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll +++ llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll @@ -194,28 +194,13 @@ define void @test_rev_elts_fail(ptr %a) #1 { ; CHECK-LABEL: test_rev_elts_fail: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: sub x9, sp, #48 -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK-NEXT: .cfi_def_cfa w29, 16 -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: adrp x8, .LCPI11_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI11_0 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: mov z1.d, z0.d[2] -; CHECK-NEXT: mov z2.d, z0.d[3] -; CHECK-NEXT: mov x9, v0.d[1] -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: fmov x10, d2 -; CHECK-NEXT: stp x10, x8, [sp, #16] -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: stp x9, x8, [sp] -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8] +; CHECK-NEXT: tbl z0.d, { z0.d }, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> @@ -260,39 +245,26 @@ ; sve-vector-bits-min=256, sve-vector-bits-max is not set, REV inst can't be generated. define void @test_revv8i32(ptr %a) #0 { -; CHECK-LABEL: test_revv8i32: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: sub x9, sp, #48 -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK-NEXT: .cfi_def_cfa w29, 16 -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: mov w9, v0.s[2] -; CHECK-NEXT: mov w10, v0.s[3] -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: mov z1.s, z0.s[4] -; CHECK-NEXT: mov z2.s, z0.s[5] -; CHECK-NEXT: mov z3.s, z0.s[6] -; CHECK-NEXT: mov z0.s, z0.s[7] -; CHECK-NEXT: stp w8, w11, [sp, #24] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: stp w10, w9, [sp, #16] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: stp w9, w8, [sp, #8] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: stp w9, w8, [sp] -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] -; CHECK-NEXT: st1w { z0.s }, p0, [x0] -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: ret +; VBITS_GE_256-LABEL: test_revv8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: index z0.s, #7, #-1 +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: tbl z0.s, { z1.s }, z0.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: test_revv8i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: adrp x8, .LCPI14_0 +; VBITS_GE_512-NEXT: add x8, x8, :lo12:.LCPI14_0 +; VBITS_GE_512-NEXT: ptrue p1.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p1/z, [x8] +; VBITS_GE_512-NEXT: tbl z0.s, { z0.s }, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> store <8 x i32> %tmp2, ptr %a @@ -379,60 +351,13 @@ define void @test_rev_fail(ptr %a) #1 { ; CHECK-LABEL: 
test_rev_fail: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: sub x9, sp, #48 -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK-NEXT: .cfi_def_cfa w29, 16 -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI20_0 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: mov z1.h, z0.h[8] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z2.h, z0.h[9] -; CHECK-NEXT: mov z3.h, z0.h[10] -; CHECK-NEXT: mov z4.h, z0.h[11] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.h, z0.h[12] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.h, z0.h[13] -; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.h, z0.h[14] -; CHECK-NEXT: strh w9, [sp, #28] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: mov z4.h, z0.h[15] -; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: strh w8, [sp, #26] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w11, s3 -; CHECK-NEXT: strh w9, [sp, #24] -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: fmov w12, s4 -; CHECK-NEXT: strh w10, [sp, #20] -; CHECK-NEXT: umov w10, v0.h[3] -; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: strh w11, [sp, #18] -; CHECK-NEXT: umov w11, v0.h[4] -; CHECK-NEXT: strh w12, [sp, #16] -; CHECK-NEXT: umov w12, v0.h[5] -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: umov w9, v0.h[6] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: umov w8, v0.h[7] -; CHECK-NEXT: strh w10, [sp, #8] -; CHECK-NEXT: strh w11, [sp, #6] -; CHECK-NEXT: strh w12, [sp, #4] -; CHECK-NEXT: strh w9, [sp, #2] -; CHECK-NEXT: strh w8, [sp] -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8] +; CHECK-NEXT: tbl z0.h, { z0.h }, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte 
Folded Reload ; CHECK-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> Index: llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll +++ llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll @@ -937,4 +937,69 @@ ret void } +define void @shuffle_v4f64_tbl_op1(ptr %a, ptr %b) #1 { +; CHECK: .LCPI42_0: +; CHECK-NEXT: .xword 1 +; CHECK-NEXT: .xword 3 +; CHECK-NEXT: .xword 2 +; CHECK-NEXT: .xword 0 +; CHECK-LABEL: shuffle_v4f64_tbl_op1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: adrp x8, .LCPI42_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI42_0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8] +; CHECK-NEXT: tbl z0.d, { z0.d }, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x double>, ptr %a + %op2 = load <4 x double>, ptr %b + %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> + store <4 x double> %ret, ptr %a + ret void +} + +define void @shuffle_v4f64_tbl_op2(ptr %a, ptr %b) #1 { +; CHECK: .LCPI43_0: +; CHECK-NEXT: .xword 1 +; CHECK-NEXT: .xword 3 +; CHECK-NEXT: .xword 2 +; CHECK-NEXT: .xword 0 +; CHECK-LABEL: shuffle_v4f64_tbl_op2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: adrp x8, .LCPI43_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI43_0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8] +; CHECK-NEXT: tbl z0.d, { z0.d }, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x double>, ptr %a + %op2 = load <4 x double>, ptr %b + %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> + store <4 x double> %ret, ptr %a + ret void +} + +define void @shuffle_v4f64_tbl2(ptr %a, ptr %b) #2 { +; CHECK-LABEL: shuffle_v4f64_tbl2: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: 
index z2.d, #2, #1 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: tbl z0.d, { z0.d, z1.d }, z2.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret + %op1 = load <4 x double>, ptr %a + %op2 = load <4 x double>, ptr %b + %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> + store <4 x double> %ret, ptr %a + ret void +} + attributes #0 = { "target-features"="+sve" } +attributes #1 = { "target-features"="+sve" vscale_range(2,2) } +attributes #2 = { "target-features"="+sve2" vscale_range(2,2) } Index: llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll +++ llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll @@ -1,29 +1,62 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s - +; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s -check-prefixes=CHECK,CHECK_SVE +; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128 +; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=1024 -aarch64-sve-vector-bits-max=1024 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_1024 +; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=2048 -aarch64-sve-vector-bits-max=2048 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_2048 target triple = "aarch64-unknown-linux-gnu" define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) { -; CHECK-LABEL: shuffle_ext_byone_v4i8: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; 
CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z2.h, z0.h[2] -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ret +; CHECK_SVE-LABEL: shuffle_ext_byone_v4i8: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 16 +; CHECK_SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK_SVE-NEXT: mov z1.h, z0.h[1] +; CHECK_SVE-NEXT: fmov w8, s0 +; CHECK_SVE-NEXT: mov z2.h, z0.h[2] +; CHECK_SVE-NEXT: mov z3.h, z0.h[3] +; CHECK_SVE-NEXT: strh w8, [sp, #8] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: fmov w9, s2 +; CHECK_SVE-NEXT: strh w8, [sp, #14] +; CHECK_SVE-NEXT: fmov w8, s3 +; CHECK_SVE-NEXT: strh w9, [sp, #12] +; CHECK_SVE-NEXT: strh w8, [sp, #10] +; CHECK_SVE-NEXT: ldr d0, [sp, #8] +; CHECK_SVE-NEXT: add sp, sp, #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_ext_byone_v4i8: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI0_0 +; SVE2_128-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2_128-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; SVE2_128-NEXT: tbl z0.h, { z0.h }, z1.h +; SVE2_128-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_ext_byone_v4i8: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.h +; SVE2_1024-NEXT: adrp x8, .LCPI0_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI0_0 +; SVE2_1024-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2_1024-NEXT: ld1h { z1.h }, p0/z, [x8] +; SVE2_1024-NEXT: tbl z0.h, { z0.h }, z1.h +; SVE2_1024-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_ext_byone_v4i8: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.h +; SVE2_2048-NEXT: adrp x8, .LCPI0_0 +; SVE2_2048-NEXT: add 
x8, x8, :lo12:.LCPI0_0 +; SVE2_2048-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2_2048-NEXT: ld1h { z1.h }, p0/z, [x8] +; SVE2_2048-NEXT: tbl z0.h, { z0.h }, z1.h +; SVE2_2048-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_2048-NEXT: ret %ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> ret <4 x i8> %ret } @@ -58,18 +91,53 @@ } define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) { -; CHECK-LABEL: shuffle_ext_byone_v32i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: mov z0.b, z0.b[15] -; CHECK-NEXT: mov z2.b, z1.b[15] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: insr z1.b, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: insr z3.b, w8 -; CHECK-NEXT: stp q1, q3, [x0] -; CHECK-NEXT: ret +; CHECK_SVE-LABEL: shuffle_ext_byone_v32i8: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: ldr q0, [x0, #16] +; CHECK_SVE-NEXT: ldp q1, q3, [x1] +; CHECK_SVE-NEXT: mov z0.b, z0.b[15] +; CHECK_SVE-NEXT: mov z2.b, z1.b[15] +; CHECK_SVE-NEXT: fmov w8, s0 +; CHECK_SVE-NEXT: insr z1.b, w8 +; CHECK_SVE-NEXT: fmov w8, s2 +; CHECK_SVE-NEXT: insr z3.b, w8 +; CHECK_SVE-NEXT: stp q1, q3, [x0] +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_ext_byone_v32i8: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: ldr q0, [x0, #16] +; SVE2_128-NEXT: ldp q1, q3, [x1] +; SVE2_128-NEXT: mov z0.b, z0.b[15] +; SVE2_128-NEXT: mov z2.b, z1.b[15] +; SVE2_128-NEXT: fmov w8, s0 +; SVE2_128-NEXT: insr z1.b, w8 +; SVE2_128-NEXT: fmov w8, s2 +; SVE2_128-NEXT: insr z3.b, w8 +; SVE2_128-NEXT: stp q1, q3, [x0] +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_ext_byone_v32i8: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.b, vl32 +; SVE2_1024-NEXT: ld1b { z0.b }, p0/z, [x0] +; SVE2_1024-NEXT: ld1b { z1.b }, p0/z, [x1] +; SVE2_1024-NEXT: mov z0.b, z0.b[31] +; SVE2_1024-NEXT: fmov w8, s0 +; SVE2_1024-NEXT: insr z1.b, w8 +; SVE2_1024-NEXT: st1b { z1.b }, p0, [x0] +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_ext_byone_v32i8: +; SVE2_2048: // 
%bb.0: +; SVE2_2048-NEXT: ptrue p0.b, vl32 +; SVE2_2048-NEXT: ld1b { z0.b }, p0/z, [x0] +; SVE2_2048-NEXT: ld1b { z1.b }, p0/z, [x1] +; SVE2_2048-NEXT: mov z0.b, z0.b[31] +; SVE2_2048-NEXT: fmov w8, s0 +; SVE2_2048-NEXT: insr z1.b, w8 +; SVE2_2048-NEXT: st1b { z1.b }, p0, [x0] +; SVE2_2048-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b %ret = shufflevector <32 x i8> %op1, <32 x i8> %op2, <32 x i32> , ptr %a %op2 = load <16 x i16>, ptr %b %ret = shufflevector <16 x i16> %op1, <16 x i16> %op2, <16 x i32> , ptr %a %op2 = load <8 x i32>, ptr %b %ret = shufflevector <8 x i32> %op1, <8 x i32> %op2, <8 x i32> @@ -204,18 +342,53 @@ } define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) { -; CHECK-LABEL: shuffle_ext_byone_v4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: mov z0.d, z0.d[1] -; CHECK-NEXT: mov z2.d, z1.d[1] -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: insr z1.d, x8 -; CHECK-NEXT: fmov x8, d2 -; CHECK-NEXT: insr z3.d, x8 -; CHECK-NEXT: stp q1, q3, [x0] -; CHECK-NEXT: ret +; CHECK_SVE-LABEL: shuffle_ext_byone_v4i64: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: ldr q0, [x0, #16] +; CHECK_SVE-NEXT: ldp q1, q3, [x1] +; CHECK_SVE-NEXT: mov z0.d, z0.d[1] +; CHECK_SVE-NEXT: mov z2.d, z1.d[1] +; CHECK_SVE-NEXT: fmov x8, d0 +; CHECK_SVE-NEXT: insr z1.d, x8 +; CHECK_SVE-NEXT: fmov x8, d2 +; CHECK_SVE-NEXT: insr z3.d, x8 +; CHECK_SVE-NEXT: stp q1, q3, [x0] +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_ext_byone_v4i64: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: ldr q0, [x0, #16] +; SVE2_128-NEXT: ldp q1, q3, [x1] +; SVE2_128-NEXT: mov z0.d, z0.d[1] +; SVE2_128-NEXT: mov z2.d, z1.d[1] +; SVE2_128-NEXT: fmov x8, d0 +; SVE2_128-NEXT: insr z1.d, x8 +; SVE2_128-NEXT: fmov x8, d2 +; SVE2_128-NEXT: insr z3.d, x8 +; SVE2_128-NEXT: stp q1, q3, [x0] +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_ext_byone_v4i64: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.d, vl4 +; SVE2_1024-NEXT: ld1d { 
z0.d }, p0/z, [x0] +; SVE2_1024-NEXT: ld1d { z1.d }, p0/z, [x1] +; SVE2_1024-NEXT: mov z0.d, z0.d[3] +; SVE2_1024-NEXT: fmov x8, d0 +; SVE2_1024-NEXT: insr z1.d, x8 +; SVE2_1024-NEXT: st1d { z1.d }, p0, [x0] +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_ext_byone_v4i64: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.d, vl4 +; SVE2_2048-NEXT: ld1d { z0.d }, p0/z, [x0] +; SVE2_2048-NEXT: ld1d { z1.d }, p0/z, [x1] +; SVE2_2048-NEXT: mov z0.d, z0.d[3] +; SVE2_2048-NEXT: fmov x8, d0 +; SVE2_2048-NEXT: insr z1.d, x8 +; SVE2_2048-NEXT: st1d { z1.d }, p0, [x0] +; SVE2_2048-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b %ret = shufflevector <4 x i64> %op1, <4 x i64> %op2, <4 x i32> @@ -251,16 +424,47 @@ } define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) { -; CHECK-LABEL: shuffle_ext_byone_v16f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: mov z0.h, z0.h[7] -; CHECK-NEXT: mov z2.h, z1.h[7] -; CHECK-NEXT: insr z1.h, h0 -; CHECK-NEXT: insr z3.h, h2 -; CHECK-NEXT: stp q1, q3, [x0] -; CHECK-NEXT: ret +; CHECK_SVE-LABEL: shuffle_ext_byone_v16f16: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: ldp q1, q3, [x1] +; CHECK_SVE-NEXT: ldr q0, [x0, #16] +; CHECK_SVE-NEXT: mov z0.h, z0.h[7] +; CHECK_SVE-NEXT: mov z2.h, z1.h[7] +; CHECK_SVE-NEXT: insr z1.h, h0 +; CHECK_SVE-NEXT: insr z3.h, h2 +; CHECK_SVE-NEXT: stp q1, q3, [x0] +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_ext_byone_v16f16: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: ldp q1, q3, [x1] +; SVE2_128-NEXT: ldr q0, [x0, #16] +; SVE2_128-NEXT: mov z0.h, z0.h[7] +; SVE2_128-NEXT: mov z2.h, z1.h[7] +; SVE2_128-NEXT: insr z1.h, h0 +; SVE2_128-NEXT: insr z3.h, h2 +; SVE2_128-NEXT: stp q1, q3, [x0] +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_ext_byone_v16f16: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.h, vl16 +; SVE2_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; SVE2_1024-NEXT: ld1h { z1.h }, p0/z, [x1] +; SVE2_1024-NEXT: mov 
z0.h, z0.h[15] +; SVE2_1024-NEXT: insr z1.h, h0 +; SVE2_1024-NEXT: st1h { z1.h }, p0, [x0] +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_ext_byone_v16f16: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.h, vl16 +; SVE2_2048-NEXT: ld1h { z0.h }, p0/z, [x0] +; SVE2_2048-NEXT: ld1h { z1.h }, p0/z, [x1] +; SVE2_2048-NEXT: mov z0.h, z0.h[15] +; SVE2_2048-NEXT: insr z1.h, h0 +; SVE2_2048-NEXT: st1h { z1.h }, p0, [x0] +; SVE2_2048-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b %ret = shufflevector <16 x half> %op1, <16 x half> %op2, <16 x i32> , ptr %a %op2 = load <8 x float>, ptr %b %ret = shufflevector <8 x float> %op1, <8 x float> %op2, <8 x i32> @@ -327,16 +562,47 @@ } define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) { -; CHECK-LABEL: shuffle_ext_byone_v4f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: mov z0.d, z0.d[1] -; CHECK-NEXT: mov z2.d, z1.d[1] -; CHECK-NEXT: insr z1.d, d0 -; CHECK-NEXT: insr z3.d, d2 -; CHECK-NEXT: stp q1, q3, [x0] -; CHECK-NEXT: ret +; CHECK_SVE-LABEL: shuffle_ext_byone_v4f64: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: ldp q1, q3, [x1] +; CHECK_SVE-NEXT: ldr q0, [x0, #16] +; CHECK_SVE-NEXT: mov z0.d, z0.d[1] +; CHECK_SVE-NEXT: mov z2.d, z1.d[1] +; CHECK_SVE-NEXT: insr z1.d, d0 +; CHECK_SVE-NEXT: insr z3.d, d2 +; CHECK_SVE-NEXT: stp q1, q3, [x0] +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_ext_byone_v4f64: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: ldp q1, q3, [x1] +; SVE2_128-NEXT: ldr q0, [x0, #16] +; SVE2_128-NEXT: mov z0.d, z0.d[1] +; SVE2_128-NEXT: mov z2.d, z1.d[1] +; SVE2_128-NEXT: insr z1.d, d0 +; SVE2_128-NEXT: insr z3.d, d2 +; SVE2_128-NEXT: stp q1, q3, [x0] +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_ext_byone_v4f64: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.d, vl4 +; SVE2_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; SVE2_1024-NEXT: ld1d { z1.d }, p0/z, [x1] +; SVE2_1024-NEXT: mov z0.d, z0.d[3] +; SVE2_1024-NEXT: 
insr z1.d, d0 +; SVE2_1024-NEXT: st1d { z1.d }, p0, [x0] +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_ext_byone_v4f64: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.d, vl4 +; SVE2_2048-NEXT: ld1d { z0.d }, p0/z, [x0] +; SVE2_2048-NEXT: ld1d { z1.d }, p0/z, [x1] +; SVE2_2048-NEXT: mov z0.d, z0.d[3] +; SVE2_2048-NEXT: insr z1.d, d0 +; SVE2_2048-NEXT: st1d { z1.d }, p0, [x0] +; SVE2_2048-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> @@ -345,16 +611,47 @@ } define void @shuffle_ext_byone_reverse(ptr %a, ptr %b) { -; CHECK-LABEL: shuffle_ext_byone_reverse: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q3, [x0] -; CHECK-NEXT: ldr q0, [x1, #16] -; CHECK-NEXT: mov z0.d, z0.d[1] -; CHECK-NEXT: mov z2.d, z1.d[1] -; CHECK-NEXT: insr z1.d, d0 -; CHECK-NEXT: insr z3.d, d2 -; CHECK-NEXT: stp q1, q3, [x0] -; CHECK-NEXT: ret +; CHECK_SVE-LABEL: shuffle_ext_byone_reverse: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: ldp q1, q3, [x0] +; CHECK_SVE-NEXT: ldr q0, [x1, #16] +; CHECK_SVE-NEXT: mov z0.d, z0.d[1] +; CHECK_SVE-NEXT: mov z2.d, z1.d[1] +; CHECK_SVE-NEXT: insr z1.d, d0 +; CHECK_SVE-NEXT: insr z3.d, d2 +; CHECK_SVE-NEXT: stp q1, q3, [x0] +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_ext_byone_reverse: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: ldp q1, q3, [x0] +; SVE2_128-NEXT: ldr q0, [x1, #16] +; SVE2_128-NEXT: mov z0.d, z0.d[1] +; SVE2_128-NEXT: mov z2.d, z1.d[1] +; SVE2_128-NEXT: insr z1.d, d0 +; SVE2_128-NEXT: insr z3.d, d2 +; SVE2_128-NEXT: stp q1, q3, [x0] +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_ext_byone_reverse: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.d, vl4 +; SVE2_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; SVE2_1024-NEXT: ld1d { z1.d }, p0/z, [x1] +; SVE2_1024-NEXT: mov z1.d, z1.d[3] +; SVE2_1024-NEXT: insr z0.d, d1 +; SVE2_1024-NEXT: st1d { z0.d }, p0, [x0] +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_ext_byone_reverse: +; 
SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.d, vl4 +; SVE2_2048-NEXT: ld1d { z0.d }, p0/z, [x0] +; SVE2_2048-NEXT: ld1d { z1.d }, p0/z, [x1] +; SVE2_2048-NEXT: mov z1.d, z1.d[3] +; SVE2_2048-NEXT: insr z0.d, d1 +; SVE2_2048-NEXT: st1d { z0.d }, p0, [x0] +; SVE2_2048-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> @@ -363,15 +660,1090 @@ } define void @shuffle_ext_invalid(ptr %a, ptr %b) { -; CHECK-LABEL: shuffle_ext_invalid: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ret +; CHECK_SVE-LABEL: shuffle_ext_invalid: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: ldr q0, [x0, #16] +; CHECK_SVE-NEXT: ldr q1, [x1] +; CHECK_SVE-NEXT: stp q0, q1, [x0] +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_ext_invalid: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: ldr q0, [x0, #16] +; SVE2_128-NEXT: ldr q1, [x1] +; SVE2_128-NEXT: stp q0, q1, [x0] +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_ext_invalid: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.d, vl4 +; SVE2_1024-NEXT: adrp x8, .LCPI22_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI22_0 +; SVE2_1024-NEXT: ptrue p1.d +; SVE2_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; SVE2_1024-NEXT: ld1d { z1.d }, p0/z, [x1] +; SVE2_1024-NEXT: ld1d { z2.d }, p1/z, [x8] +; SVE2_1024-NEXT: tbl z0.d, { z0.d, z1.d }, z2.d +; SVE2_1024-NEXT: st1d { z0.d }, p0, [x0] +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_ext_invalid: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.d, vl4 +; SVE2_2048-NEXT: adrp x8, .LCPI22_0 +; SVE2_2048-NEXT: add x8, x8, :lo12:.LCPI22_0 +; SVE2_2048-NEXT: ptrue p1.d +; SVE2_2048-NEXT: ld1d { z0.d }, p0/z, [x0] +; SVE2_2048-NEXT: ld1d { z1.d }, p0/z, [x1] +; SVE2_2048-NEXT: ld1d { z2.d }, p1/z, [x8] +; SVE2_2048-NEXT: tbl z0.d, { z0.d, z1.d }, z2.d +; SVE2_2048-NEXT: st1d { z0.d }, p0, [x0] +; SVE2_2048-NEXT: ret %op1 = load <4 
x double>, ptr %a %op2 = load <4 x double>, ptr %b %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> store <4 x double> %ret, ptr %a ret void } + +; SVE2_128: .LCPI23_0: +; SVE2_128-NEXT: .hword 10 +; SVE2_128-NEXT: .hword 1 +; SVE2_128-NEXT: .hword 3 +; SVE2_128-NEXT: .hword 8 +; SVE2_128-NEXT: .hword 65535 +; SVE2_128-NEXT: .hword 65535 +; SVE2_128-NEXT: .hword 65535 +; SVE2_128-NEXT: .hword 65535 +define <4 x i16> @sve2_shuffle_v4i16_tbl2(ptr %a, ptr %b) { +; CHECK_SVE-LABEL: sve2_shuffle_v4i16_tbl2: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 16 +; CHECK_SVE-NEXT: ldr d0, [x0] +; CHECK_SVE-NEXT: ldr d1, [x1] +; CHECK_SVE-NEXT: mov z2.h, z0.h[3] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: mov z0.h, z0.h[1] +; CHECK_SVE-NEXT: mov z3.h, z1.h[2] +; CHECK_SVE-NEXT: strh w8, [sp, #14] +; CHECK_SVE-NEXT: fmov w8, s2 +; CHECK_SVE-NEXT: fmov w9, s0 +; CHECK_SVE-NEXT: strh w8, [sp, #12] +; CHECK_SVE-NEXT: fmov w8, s3 +; CHECK_SVE-NEXT: strh w9, [sp, #10] +; CHECK_SVE-NEXT: strh w8, [sp, #8] +; CHECK_SVE-NEXT: ldr d0, [sp, #8] +; CHECK_SVE-NEXT: add sp, sp, #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: sve2_shuffle_v4i16_tbl2: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI23_0 +; SVE2_128-NEXT: ldr d0, [x0] +; SVE2_128-NEXT: ldr d1, [x1] +; SVE2_128-NEXT: ldr q2, [x8, :lo12:.LCPI23_0] +; SVE2_128-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h +; SVE2_128-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: sve2_shuffle_v4i16_tbl2: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.h +; SVE2_1024-NEXT: ldr d0, [x0] +; SVE2_1024-NEXT: adrp x8, .LCPI23_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI23_0 +; SVE2_1024-NEXT: ldr d1, [x1] +; SVE2_1024-NEXT: ld1h { z2.h }, p0/z, [x8] +; SVE2_1024-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h +; SVE2_1024-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: sve2_shuffle_v4i16_tbl2: +; 
SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.h +; SVE2_2048-NEXT: ldr d0, [x0] +; SVE2_2048-NEXT: adrp x8, .LCPI23_0 +; SVE2_2048-NEXT: add x8, x8, :lo12:.LCPI23_0 +; SVE2_2048-NEXT: ldr d1, [x1] +; SVE2_2048-NEXT: ld1h { z2.h }, p0/z, [x8] +; SVE2_2048-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h +; SVE2_2048-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_2048-NEXT: ret + %op1 = load <4 x i16>, ptr %a + %op2 = load <4 x i16>, ptr %b + %1 = shufflevector <4 x i16> %op1, <4 x i16> %op2, <4 x i32> + ret <4 x i16> %1 +} + +; SVE2_128: .LCPI24_0: +; SVE2_128-NEXT: .hword 0 +; SVE2_128-NEXT: .hword 3 +; SVE2_128-NEXT: .hword 7 +; SVE2_128-NEXT: .hword 7 +; SVE2_128-NEXT: .hword 15 +; SVE2_128-NEXT: .hword 0 +; SVE2_128-NEXT: .hword 0 +; SVE2_128-NEXT: .hword 1 +define <8 x i16> @sve2_shuffle_v8i16_tbl2(ptr %a, ptr %b) { +; CHECK_SVE-LABEL: sve2_shuffle_v8i16_tbl2: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 16 +; CHECK_SVE-NEXT: ldr q0, [x0] +; CHECK_SVE-NEXT: ldr q1, [x1] +; CHECK_SVE-NEXT: mov z2.h, z0.h[1] +; CHECK_SVE-NEXT: fmov w8, s0 +; CHECK_SVE-NEXT: mov z1.h, z1.h[7] +; CHECK_SVE-NEXT: mov z3.h, z0.h[7] +; CHECK_SVE-NEXT: mov z0.h, z0.h[3] +; CHECK_SVE-NEXT: strh w8, [sp, #12] +; CHECK_SVE-NEXT: fmov w9, s2 +; CHECK_SVE-NEXT: strh w8, [sp, #10] +; CHECK_SVE-NEXT: strh w8, [sp] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: strh w9, [sp, #14] +; CHECK_SVE-NEXT: fmov w9, s3 +; CHECK_SVE-NEXT: strh w8, [sp, #8] +; CHECK_SVE-NEXT: fmov w8, s0 +; CHECK_SVE-NEXT: strh w9, [sp, #6] +; CHECK_SVE-NEXT: strh w9, [sp, #4] +; CHECK_SVE-NEXT: strh w8, [sp, #2] +; CHECK_SVE-NEXT: ldr q0, [sp], #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: sve2_shuffle_v8i16_tbl2: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI24_0 +; SVE2_128-NEXT: ldr q0, [x0] +; SVE2_128-NEXT: ldr q1, [x1] +; SVE2_128-NEXT: ldr q2, [x8, :lo12:.LCPI24_0] +; SVE2_128-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h +; SVE2_128-NEXT: // kill: def $q0 
killed $q0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: sve2_shuffle_v8i16_tbl2: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.h +; SVE2_1024-NEXT: ldr q0, [x0] +; SVE2_1024-NEXT: adrp x8, .LCPI24_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI24_0 +; SVE2_1024-NEXT: ldr q1, [x1] +; SVE2_1024-NEXT: ld1h { z2.h }, p0/z, [x8] +; SVE2_1024-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h +; SVE2_1024-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: sve2_shuffle_v8i16_tbl2: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.h +; SVE2_2048-NEXT: ldr q0, [x0] +; SVE2_2048-NEXT: adrp x8, .LCPI24_0 +; SVE2_2048-NEXT: add x8, x8, :lo12:.LCPI24_0 +; SVE2_2048-NEXT: ldr q1, [x1] +; SVE2_2048-NEXT: ld1h { z2.h }, p0/z, [x8] +; SVE2_2048-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h +; SVE2_2048-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_2048-NEXT: ret + %op1 = load <8 x i16>, ptr %a + %op2 = load <8 x i16>, ptr %b + %1 = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> + ret <8 x i16> %1 +} + +; SVE2_128: .LCPI25_0: +; SVE2_128-NEXT: .hword 0 +; SVE2_128-NEXT: .hword 3 +; SVE2_128-NEXT: .hword 7 +; SVE2_128-NEXT: .hword 7 +; SVE2_128-NEXT: .hword 1 +; SVE2_128-NEXT: .hword 0 +; SVE2_128-NEXT: .hword 0 +; SVE2_128-NEXT: .hword 1 +define <8 x i16> @sve2_shuffle_v8i16_tbl_op1(ptr %a, ptr %b) { +; CHECK_SVE-LABEL: sve2_shuffle_v8i16_tbl_op1: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 16 +; CHECK_SVE-NEXT: ldr q0, [x0] +; CHECK_SVE-NEXT: mov z1.h, z0.h[1] +; CHECK_SVE-NEXT: fmov w8, s0 +; CHECK_SVE-NEXT: mov z2.h, z0.h[7] +; CHECK_SVE-NEXT: mov z0.h, z0.h[3] +; CHECK_SVE-NEXT: strh w8, [sp, #12] +; CHECK_SVE-NEXT: strh w8, [sp, #10] +; CHECK_SVE-NEXT: strh w8, [sp] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: strh w8, [sp, #14] +; CHECK_SVE-NEXT: strh w8, [sp, #8] +; CHECK_SVE-NEXT: fmov w8, s2 +; CHECK_SVE-NEXT: strh w8, [sp, #6] +; CHECK_SVE-NEXT: strh w8, [sp, #4] +; 
CHECK_SVE-NEXT: fmov w8, s0 +; CHECK_SVE-NEXT: strh w8, [sp, #2] +; CHECK_SVE-NEXT: ldr q0, [sp], #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: sve2_shuffle_v8i16_tbl_op1: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI25_0 +; SVE2_128-NEXT: ldr q0, [x0] +; SVE2_128-NEXT: ldr q1, [x8, :lo12:.LCPI25_0] +; SVE2_128-NEXT: tbl z0.h, { z0.h }, z1.h +; SVE2_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: sve2_shuffle_v8i16_tbl_op1: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.h +; SVE2_1024-NEXT: adrp x8, .LCPI25_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI25_0 +; SVE2_1024-NEXT: ldr q0, [x0] +; SVE2_1024-NEXT: ld1h { z1.h }, p0/z, [x8] +; SVE2_1024-NEXT: tbl z0.h, { z0.h }, z1.h +; SVE2_1024-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: sve2_shuffle_v8i16_tbl_op1: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.h +; SVE2_2048-NEXT: adrp x8, .LCPI25_0 +; SVE2_2048-NEXT: add x8, x8, :lo12:.LCPI25_0 +; SVE2_2048-NEXT: ldr q0, [x0] +; SVE2_2048-NEXT: ld1h { z1.h }, p0/z, [x8] +; SVE2_2048-NEXT: tbl z0.h, { z0.h }, z1.h +; SVE2_2048-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_2048-NEXT: ret + %op1 = load <8 x i16>, ptr %a + %op2 = load <8 x i16>, ptr %b + %1 = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> + ret <8 x i16> %1 +} + +; SVE2_128: .LCPI26_0: +; SVE2_128-NEXT: .hword 2 +; SVE2_128-NEXT: .hword 5 +; SVE2_128-NEXT: .hword 2 +; SVE2_128-NEXT: .hword 3 +; SVE2_128-NEXT: .hword 7 +; SVE2_128-NEXT: .hword 3 +; SVE2_128-NEXT: .hword 3 +; SVE2_128-NEXT: .hword 2 +define <8 x i16> @sve2_shuffle_v8i16_tbl_op2(ptr %a, ptr %b) { +; CHECK_SVE-LABEL: sve2_shuffle_v8i16_tbl_op2: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 16 +; CHECK_SVE-NEXT: ldr q0, [x1] +; CHECK_SVE-NEXT: mov z1.h, z0.h[2] +; CHECK_SVE-NEXT: mov z2.h, z0.h[3] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: mov z1.h, z0.h[7] 
+; CHECK_SVE-NEXT: fmov w9, s2 +; CHECK_SVE-NEXT: mov z0.h, z0.h[5] +; CHECK_SVE-NEXT: strh w9, [sp, #12] +; CHECK_SVE-NEXT: fmov w10, s1 +; CHECK_SVE-NEXT: strh w9, [sp, #10] +; CHECK_SVE-NEXT: strh w9, [sp, #6] +; CHECK_SVE-NEXT: fmov w9, s0 +; CHECK_SVE-NEXT: strh w8, [sp, #14] +; CHECK_SVE-NEXT: strh w10, [sp, #8] +; CHECK_SVE-NEXT: strh w8, [sp, #4] +; CHECK_SVE-NEXT: strh w9, [sp, #2] +; CHECK_SVE-NEXT: strh w8, [sp] +; CHECK_SVE-NEXT: ldr q0, [sp], #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: sve2_shuffle_v8i16_tbl_op2: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI26_0 +; SVE2_128-NEXT: ldr q0, [x1] +; SVE2_128-NEXT: ldr q1, [x8, :lo12:.LCPI26_0] +; SVE2_128-NEXT: tbl z0.h, { z0.h }, z1.h +; SVE2_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: sve2_shuffle_v8i16_tbl_op2: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.h +; SVE2_1024-NEXT: adrp x8, .LCPI26_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI26_0 +; SVE2_1024-NEXT: ldr q0, [x1] +; SVE2_1024-NEXT: ld1h { z1.h }, p0/z, [x8] +; SVE2_1024-NEXT: tbl z0.h, { z0.h }, z1.h +; SVE2_1024-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: sve2_shuffle_v8i16_tbl_op2: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.h +; SVE2_2048-NEXT: adrp x8, .LCPI26_0 +; SVE2_2048-NEXT: add x8, x8, :lo12:.LCPI26_0 +; SVE2_2048-NEXT: ldr q0, [x1] +; SVE2_2048-NEXT: ld1h { z1.h }, p0/z, [x8] +; SVE2_2048-NEXT: tbl z0.h, { z0.h }, z1.h +; SVE2_2048-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_2048-NEXT: ret + %op1 = load <8 x i16>, ptr %a + %op2 = load <8 x i16>, ptr %b + %1 = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> + ret <8 x i16> %1 +} + +; SVE2_128: .LCPI27_0: +; SVE2_128-NEXT: .word 0 +; SVE2_128-NEXT: .word 3 +; SVE2_128-NEXT: .word 5 +; SVE2_128-NEXT: .word 1 +define <4 x float> @sve2_shuffle_v4f32_tbl2_op2(ptr %a, ptr %b) { +; CHECK_SVE-LABEL: sve2_shuffle_v4f32_tbl2_op2: +; CHECK_SVE: // 
%bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 16 +; CHECK_SVE-NEXT: ldr q0, [x0] +; CHECK_SVE-NEXT: ldr q1, [x1] +; CHECK_SVE-NEXT: mov z2.s, z0.s[1] +; CHECK_SVE-NEXT: mov z1.s, z1.s[1] +; CHECK_SVE-NEXT: mov z3.s, z0.s[3] +; CHECK_SVE-NEXT: stp s1, s2, [sp, #8] +; CHECK_SVE-NEXT: stp s0, s3, [sp] +; CHECK_SVE-NEXT: ldr q0, [sp], #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: sve2_shuffle_v4f32_tbl2_op2: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI27_0 +; SVE2_128-NEXT: ldr q0, [x0] +; SVE2_128-NEXT: ldr q1, [x1] +; SVE2_128-NEXT: ldr q2, [x8, :lo12:.LCPI27_0] +; SVE2_128-NEXT: tbl z0.s, { z0.s, z1.s }, z2.s +; SVE2_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: sve2_shuffle_v4f32_tbl2_op2: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.s +; SVE2_1024-NEXT: ldr q0, [x0] +; SVE2_1024-NEXT: adrp x8, .LCPI27_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI27_0 +; SVE2_1024-NEXT: ldr q1, [x1] +; SVE2_1024-NEXT: ld1w { z2.s }, p0/z, [x8] +; SVE2_1024-NEXT: tbl z0.s, { z0.s, z1.s }, z2.s +; SVE2_1024-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: sve2_shuffle_v4f32_tbl2_op2: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.s +; SVE2_2048-NEXT: ldr q0, [x0] +; SVE2_2048-NEXT: adrp x8, .LCPI27_0 +; SVE2_2048-NEXT: add x8, x8, :lo12:.LCPI27_0 +; SVE2_2048-NEXT: ldr q1, [x1] +; SVE2_2048-NEXT: ld1w { z2.s }, p0/z, [x8] +; SVE2_2048-NEXT: tbl z0.s, { z0.s, z1.s }, z2.s +; SVE2_2048-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_2048-NEXT: ret + %op1 = load <4 x float>, ptr %a + %op2 = load <4 x float>, ptr %b + %1 = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32> + ret <4 x float> %1 +} + +; SVE2_128: .LCPI28_0: +; SVE2_128-NEXT: .word 0 +; SVE2_128-NEXT: .word 3 +; SVE2_128-NEXT: .word 2 +; SVE2_128-NEXT: .word 1 +define <4 x float> @sve2_shuffle_v4f32_tbl_op1(ptr %a, ptr %b) { +; CHECK_SVE-LABEL: 
sve2_shuffle_v4f32_tbl_op1: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 16 +; CHECK_SVE-NEXT: ldr q0, [x0] +; CHECK_SVE-NEXT: mov z1.s, z0.s[1] +; CHECK_SVE-NEXT: mov z2.s, z0.s[2] +; CHECK_SVE-NEXT: mov z3.s, z0.s[3] +; CHECK_SVE-NEXT: stp s2, s1, [sp, #8] +; CHECK_SVE-NEXT: stp s0, s3, [sp] +; CHECK_SVE-NEXT: ldr q0, [sp], #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: sve2_shuffle_v4f32_tbl_op1: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI28_0 +; SVE2_128-NEXT: ldr q0, [x0] +; SVE2_128-NEXT: ldr q1, [x8, :lo12:.LCPI28_0] +; SVE2_128-NEXT: tbl z0.s, { z0.s }, z1.s +; SVE2_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: sve2_shuffle_v4f32_tbl_op1: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.s +; SVE2_1024-NEXT: adrp x8, .LCPI28_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI28_0 +; SVE2_1024-NEXT: ldr q0, [x0] +; SVE2_1024-NEXT: ld1w { z1.s }, p0/z, [x8] +; SVE2_1024-NEXT: tbl z0.s, { z0.s }, z1.s +; SVE2_1024-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: sve2_shuffle_v4f32_tbl_op1: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.s +; SVE2_2048-NEXT: adrp x8, .LCPI28_0 +; SVE2_2048-NEXT: add x8, x8, :lo12:.LCPI28_0 +; SVE2_2048-NEXT: ldr q0, [x0] +; SVE2_2048-NEXT: ld1w { z1.s }, p0/z, [x8] +; SVE2_2048-NEXT: tbl z0.s, { z0.s }, z1.s +; SVE2_2048-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_2048-NEXT: ret + %op1 = load <4 x float>, ptr %a + %op2 = load <4 x float>, ptr %b + %1 = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32> + ret <4 x float> %1 +} + +; SVE2_128: .LCPI29_0: +; SVE2_128-NEXT: .byte 0 +; SVE2_128-NEXT: .byte 1 +; SVE2_128-NEXT: .byte 2 +; SVE2_128-NEXT: .byte 3 +; SVE2_128-NEXT: .byte 4 +; SVE2_128-NEXT: .byte 7 +; SVE2_128-NEXT: .byte 6 +; SVE2_128-NEXT: .byte 7 +; SVE2_128-NEXT: .byte 255 +; SVE2_128-NEXT: .byte 255 +define <8 x i8> 
@shuffle_index_size_acceptable_op2(ptr %a, ptr %b) { +; CHECK_SVE-LABEL: shuffle_index_size_acceptable_op2: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 16 +; CHECK_SVE-NEXT: ldr d0, [x1] +; CHECK_SVE-NEXT: mov z1.b, z0.b[7] +; CHECK_SVE-NEXT: mov z2.b, z0.b[6] +; CHECK_SVE-NEXT: fmov w8, s0 +; CHECK_SVE-NEXT: mov z3.b, z0.b[4] +; CHECK_SVE-NEXT: strb w8, [sp, #8] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: mov z1.b, z0.b[3] +; CHECK_SVE-NEXT: fmov w9, s2 +; CHECK_SVE-NEXT: mov z2.b, z0.b[2] +; CHECK_SVE-NEXT: mov z0.b, z0.b[1] +; CHECK_SVE-NEXT: strb w8, [sp, #15] +; CHECK_SVE-NEXT: strb w9, [sp, #14] +; CHECK_SVE-NEXT: fmov w9, s3 +; CHECK_SVE-NEXT: strb w8, [sp, #13] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: strb w9, [sp, #12] +; CHECK_SVE-NEXT: fmov w9, s2 +; CHECK_SVE-NEXT: strb w8, [sp, #11] +; CHECK_SVE-NEXT: fmov w8, s0 +; CHECK_SVE-NEXT: strb w9, [sp, #10] +; CHECK_SVE-NEXT: strb w8, [sp, #9] +; CHECK_SVE-NEXT: ldr d0, [sp, #8] +; CHECK_SVE-NEXT: add sp, sp, #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_index_size_acceptable_op2: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI29_0 +; SVE2_128-NEXT: ldr d0, [x1] +; SVE2_128-NEXT: ldr q1, [x8, :lo12:.LCPI29_0] +; SVE2_128-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_128-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_index_size_acceptable_op2: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.b +; SVE2_1024-NEXT: adrp x8, .LCPI29_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI29_0 +; SVE2_1024-NEXT: ldr d0, [x1] +; SVE2_1024-NEXT: ld1b { z1.b }, p0/z, [x8] +; SVE2_1024-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_1024-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_index_size_acceptable_op2: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.b +; SVE2_2048-NEXT: adrp x8, .LCPI29_0 +; SVE2_2048-NEXT: add x8, x8, :lo12:.LCPI29_0 +; 
SVE2_2048-NEXT: ldr d0, [x1] +; SVE2_2048-NEXT: ld1b { z1.b }, p0/z, [x8] +; SVE2_2048-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_2048-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_2048-NEXT: ret + %op1 = load <8 x i8>, ptr %a + %op2 = load <8 x i8>, ptr %b + %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> + ret <8 x i8> %1 +} + +; SVE2_1024: .LCPI30_0: +; SVE2_1024-NEXT: .byte 1 +; SVE2_1024-NEXT: .byte 2 +; SVE2_1024-NEXT: .byte 3 +; SVE2_1024-NEXT: .byte 4 +; SVE2_1024-NEXT: .byte 5 +; SVE2_1024-NEXT: .byte 7 +; SVE2_1024-NEXT: .byte 6 +; SVE2_1024-NEXT: .byte 7 +; SVE2_1024-NEXT: .byte 255 +; SVE2_1024-NEXT: .byte 255 +define <8 x i8> @shuffle_index_size_acceptable_op1(ptr %a, ptr %b) { +; CHECK_SVE-LABEL: shuffle_index_size_acceptable_op1: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 16 +; CHECK_SVE-NEXT: ldr d0, [x0] +; CHECK_SVE-NEXT: mov z2.b, z0.b[6] +; CHECK_SVE-NEXT: mov z1.b, z0.b[7] +; CHECK_SVE-NEXT: mov z3.b, z0.b[5] +; CHECK_SVE-NEXT: fmov w9, s2 +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: mov z1.b, z0.b[4] +; CHECK_SVE-NEXT: mov z2.b, z0.b[3] +; CHECK_SVE-NEXT: strb w9, [sp, #14] +; CHECK_SVE-NEXT: fmov w9, s3 +; CHECK_SVE-NEXT: mov z3.b, z0.b[2] +; CHECK_SVE-NEXT: strb w8, [sp, #15] +; CHECK_SVE-NEXT: mov z0.b, z0.b[1] +; CHECK_SVE-NEXT: strb w8, [sp, #13] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: strb w9, [sp, #12] +; CHECK_SVE-NEXT: fmov w9, s2 +; CHECK_SVE-NEXT: strb w8, [sp, #11] +; CHECK_SVE-NEXT: fmov w8, s3 +; CHECK_SVE-NEXT: strb w9, [sp, #10] +; CHECK_SVE-NEXT: fmov w9, s0 +; CHECK_SVE-NEXT: strb w8, [sp, #9] +; CHECK_SVE-NEXT: strb w9, [sp, #8] +; CHECK_SVE-NEXT: ldr d0, [sp, #8] +; CHECK_SVE-NEXT: add sp, sp, #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_index_size_acceptable_op1: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI30_0 +; SVE2_128-NEXT: ldr d0, [x0] +; SVE2_128-NEXT: ldr q1, [x8, :lo12:.LCPI30_0] +; SVE2_128-NEXT: tbl z0.b, { 
z0.b }, z1.b +; SVE2_128-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_index_size_acceptable_op1: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.b +; SVE2_1024-NEXT: adrp x8, .LCPI30_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI30_0 +; SVE2_1024-NEXT: ldr d0, [x0] +; SVE2_1024-NEXT: ld1b { z1.b }, p0/z, [x8] +; SVE2_1024-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_1024-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_index_size_acceptable_op1: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.b +; SVE2_2048-NEXT: adrp x8, .LCPI30_0 +; SVE2_2048-NEXT: add x8, x8, :lo12:.LCPI30_0 +; SVE2_2048-NEXT: ldr d0, [x0] +; SVE2_2048-NEXT: ld1b { z1.b }, p0/z, [x8] +; SVE2_2048-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_2048-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_2048-NEXT: ret + %op1 = load <8 x i8>, ptr %a + %op2 = load <8 x i8>, ptr %b + %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> + ret <8 x i8> %1 +} + +; SVE2_128: .LCPI31_0: +; SVE2_128-NEXT: .byte 1 +; SVE2_128-NEXT: .byte 17 +; SVE2_128-NEXT: .byte 18 +; SVE2_128-NEXT: .byte 19 +; SVE2_128-NEXT: .byte 20 +; SVE2_128-NEXT: .byte 20 +; SVE2_128-NEXT: .byte 22 +; SVE2_128-NEXT: .byte 23 +; SVE2_128-NEXT: .byte 255 +; SVE2_128-NEXT: .byte 255 +define <8 x i8> @shuffle_index_size_acceptable_op_both(ptr %a, ptr %b) { +; CHECK_SVE-LABEL: shuffle_index_size_acceptable_op_both: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 16 +; CHECK_SVE-NEXT: ldr d0, [x1] +; CHECK_SVE-NEXT: mov z1.b, z0.b[7] +; CHECK_SVE-NEXT: mov z2.b, z0.b[6] +; CHECK_SVE-NEXT: mov z3.b, z0.b[4] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: ldr d1, [x0] +; CHECK_SVE-NEXT: fmov w9, s2 +; CHECK_SVE-NEXT: mov z2.b, z0.b[3] +; CHECK_SVE-NEXT: mov z1.b, z1.b[1] +; CHECK_SVE-NEXT: strb w8, [sp, #15] +; CHECK_SVE-NEXT: fmov w8, s3 +; CHECK_SVE-NEXT: mov z3.b, z0.b[2] +; 
CHECK_SVE-NEXT: strb w9, [sp, #14] +; CHECK_SVE-NEXT: mov z0.b, z0.b[1] +; CHECK_SVE-NEXT: fmov w9, s2 +; CHECK_SVE-NEXT: strb w8, [sp, #13] +; CHECK_SVE-NEXT: strb w8, [sp, #12] +; CHECK_SVE-NEXT: fmov w8, s3 +; CHECK_SVE-NEXT: strb w9, [sp, #11] +; CHECK_SVE-NEXT: fmov w9, s0 +; CHECK_SVE-NEXT: strb w8, [sp, #10] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: strb w9, [sp, #9] +; CHECK_SVE-NEXT: strb w8, [sp, #8] +; CHECK_SVE-NEXT: ldr d0, [sp, #8] +; CHECK_SVE-NEXT: add sp, sp, #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_index_size_acceptable_op_both: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI31_0 +; SVE2_128-NEXT: ldr d0, [x0] +; SVE2_128-NEXT: ldr d1, [x1] +; SVE2_128-NEXT: ldr q2, [x8, :lo12:.LCPI31_0] +; SVE2_128-NEXT: tbl z0.b, { z0.b, z1.b }, z2.b +; SVE2_128-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_index_size_acceptable_op_both: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.b +; SVE2_1024-NEXT: ldr d0, [x0] +; SVE2_1024-NEXT: adrp x8, .LCPI31_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI31_0 +; SVE2_1024-NEXT: ldr d1, [x1] +; SVE2_1024-NEXT: ld1b { z2.b }, p0/z, [x8] +; SVE2_1024-NEXT: tbl z0.b, { z0.b, z1.b }, z2.b +; SVE2_1024-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_index_size_acceptable_op_both: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: sub sp, sp, #16 +; SVE2_2048-NEXT: .cfi_def_cfa_offset 16 +; SVE2_2048-NEXT: ldr d0, [x1] +; SVE2_2048-NEXT: mov z1.b, z0.b[7] +; SVE2_2048-NEXT: mov z2.b, z0.b[6] +; SVE2_2048-NEXT: mov z3.b, z0.b[4] +; SVE2_2048-NEXT: fmov w8, s1 +; SVE2_2048-NEXT: ldr d1, [x0] +; SVE2_2048-NEXT: fmov w9, s2 +; SVE2_2048-NEXT: mov z2.b, z0.b[3] +; SVE2_2048-NEXT: mov z1.b, z1.b[1] +; SVE2_2048-NEXT: strb w8, [sp, #15] +; SVE2_2048-NEXT: fmov w8, s3 +; SVE2_2048-NEXT: mov z3.b, z0.b[2] +; SVE2_2048-NEXT: strb w9, [sp, #14] +; SVE2_2048-NEXT: mov z0.b, z0.b[1] +; SVE2_2048-NEXT: fmov w9, 
s2 +; SVE2_2048-NEXT: strb w8, [sp, #13] +; SVE2_2048-NEXT: strb w8, [sp, #12] +; SVE2_2048-NEXT: fmov w8, s3 +; SVE2_2048-NEXT: strb w9, [sp, #11] +; SVE2_2048-NEXT: fmov w9, s0 +; SVE2_2048-NEXT: strb w8, [sp, #10] +; SVE2_2048-NEXT: fmov w8, s1 +; SVE2_2048-NEXT: strb w9, [sp, #9] +; SVE2_2048-NEXT: strb w8, [sp, #8] +; SVE2_2048-NEXT: ldr d0, [sp, #8] +; SVE2_2048-NEXT: add sp, sp, #16 +; SVE2_2048-NEXT: ret + %op1 = load <8 x i8>, ptr %a + %op2 = load <8 x i8>, ptr %b + %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> + ret <8 x i8> %1 +} + +define <8 x i8> @shuffle_index_size_unacceptable_op_both_maxhw(ptr %a, ptr %b) { +; CHECK_SVE-LABEL: shuffle_index_size_unacceptable_op_both_maxhw: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 16 +; CHECK_SVE-NEXT: ldr d0, [x1] +; CHECK_SVE-NEXT: mov z1.b, z0.b[7] +; CHECK_SVE-NEXT: mov z2.b, z0.b[6] +; CHECK_SVE-NEXT: mov z3.b, z0.b[4] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: ldr d1, [x0] +; CHECK_SVE-NEXT: fmov w9, s2 +; CHECK_SVE-NEXT: mov z2.b, z0.b[3] +; CHECK_SVE-NEXT: mov z1.b, z1.b[1] +; CHECK_SVE-NEXT: strb w8, [sp, #15] +; CHECK_SVE-NEXT: fmov w8, s3 +; CHECK_SVE-NEXT: mov z3.b, z0.b[2] +; CHECK_SVE-NEXT: strb w9, [sp, #14] +; CHECK_SVE-NEXT: mov z0.b, z0.b[1] +; CHECK_SVE-NEXT: fmov w9, s2 +; CHECK_SVE-NEXT: strb w8, [sp, #13] +; CHECK_SVE-NEXT: strb w8, [sp, #12] +; CHECK_SVE-NEXT: fmov w8, s3 +; CHECK_SVE-NEXT: strb w9, [sp, #11] +; CHECK_SVE-NEXT: fmov w9, s0 +; CHECK_SVE-NEXT: strb w8, [sp, #10] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: strb w9, [sp, #9] +; CHECK_SVE-NEXT: strb w8, [sp, #8] +; CHECK_SVE-NEXT: ldr d0, [sp, #8] +; CHECK_SVE-NEXT: add sp, sp, #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_index_size_unacceptable_op_both_maxhw: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI32_0 +; SVE2_128-NEXT: ldr d0, [x0] +; SVE2_128-NEXT: ldr d1, [x1] +; SVE2_128-NEXT: ldr q2, [x8, :lo12:.LCPI32_0] +; 
SVE2_128-NEXT: tbl z0.b, { z0.b, z1.b }, z2.b +; SVE2_128-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_index_size_unacceptable_op_both_maxhw: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.b +; SVE2_1024-NEXT: ldr d0, [x0] +; SVE2_1024-NEXT: adrp x8, .LCPI32_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI32_0 +; SVE2_1024-NEXT: ldr d1, [x1] +; SVE2_1024-NEXT: ld1b { z2.b }, p0/z, [x8] +; SVE2_1024-NEXT: tbl z0.b, { z0.b, z1.b }, z2.b +; SVE2_1024-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_index_size_unacceptable_op_both_maxhw: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: sub sp, sp, #16 +; SVE2_2048-NEXT: .cfi_def_cfa_offset 16 +; SVE2_2048-NEXT: ldr d0, [x1] +; SVE2_2048-NEXT: mov z1.b, z0.b[7] +; SVE2_2048-NEXT: mov z2.b, z0.b[6] +; SVE2_2048-NEXT: mov z3.b, z0.b[4] +; SVE2_2048-NEXT: fmov w8, s1 +; SVE2_2048-NEXT: ldr d1, [x0] +; SVE2_2048-NEXT: fmov w9, s2 +; SVE2_2048-NEXT: mov z2.b, z0.b[3] +; SVE2_2048-NEXT: mov z1.b, z1.b[1] +; SVE2_2048-NEXT: strb w8, [sp, #15] +; SVE2_2048-NEXT: fmov w8, s3 +; SVE2_2048-NEXT: mov z3.b, z0.b[2] +; SVE2_2048-NEXT: strb w9, [sp, #14] +; SVE2_2048-NEXT: mov z0.b, z0.b[1] +; SVE2_2048-NEXT: fmov w9, s2 +; SVE2_2048-NEXT: strb w8, [sp, #13] +; SVE2_2048-NEXT: strb w8, [sp, #12] +; SVE2_2048-NEXT: fmov w8, s3 +; SVE2_2048-NEXT: strb w9, [sp, #11] +; SVE2_2048-NEXT: fmov w9, s0 +; SVE2_2048-NEXT: strb w8, [sp, #10] +; SVE2_2048-NEXT: fmov w8, s1 +; SVE2_2048-NEXT: strb w9, [sp, #9] +; SVE2_2048-NEXT: strb w8, [sp, #8] +; SVE2_2048-NEXT: ldr d0, [sp, #8] +; SVE2_2048-NEXT: add sp, sp, #16 +; SVE2_2048-NEXT: ret + %op1 = load <8 x i8>, ptr %a + %op2 = load <8 x i8>, ptr %b + %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> + ret <8 x i8> %1 +} + +; SVE2_2048: .LCPI33_0: +; SVE2_2048-NEXT: .hword 1 +; SVE2_2048-NEXT: .hword 129 +; SVE2_2048-NEXT: .hword 130 +; SVE2_2048-NEXT: .hword 131 +; SVE2_2048-NEXT: .hword 132 
+; SVE2_2048-NEXT: .hword 132 +; SVE2_2048-NEXT: .hword 134 +; SVE2_2048-NEXT: .hword 135 +; SVE2_2048-NEXT: .hword 65535 +; SVE2_2048-NEXT: .hword 65535 +define <8 x i16> @shuffle_index_size_acceptable_i16_op_both(ptr %a, ptr %b) { +; CHECK_SVE-LABEL: shuffle_index_size_acceptable_i16_op_both: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 16 +; CHECK_SVE-NEXT: ldr q0, [x1] +; CHECK_SVE-NEXT: mov z1.h, z0.h[7] +; CHECK_SVE-NEXT: mov z2.h, z0.h[6] +; CHECK_SVE-NEXT: mov z3.h, z0.h[4] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: ldr q1, [x0] +; CHECK_SVE-NEXT: fmov w9, s2 +; CHECK_SVE-NEXT: mov z2.h, z0.h[3] +; CHECK_SVE-NEXT: mov z1.h, z1.h[1] +; CHECK_SVE-NEXT: strh w8, [sp, #14] +; CHECK_SVE-NEXT: fmov w8, s3 +; CHECK_SVE-NEXT: mov z3.h, z0.h[2] +; CHECK_SVE-NEXT: strh w9, [sp, #12] +; CHECK_SVE-NEXT: mov z0.h, z0.h[1] +; CHECK_SVE-NEXT: fmov w9, s2 +; CHECK_SVE-NEXT: strh w8, [sp, #10] +; CHECK_SVE-NEXT: strh w8, [sp, #8] +; CHECK_SVE-NEXT: fmov w8, s3 +; CHECK_SVE-NEXT: strh w9, [sp, #6] +; CHECK_SVE-NEXT: fmov w9, s0 +; CHECK_SVE-NEXT: strh w8, [sp, #4] +; CHECK_SVE-NEXT: fmov w8, s1 +; CHECK_SVE-NEXT: strh w9, [sp, #2] +; CHECK_SVE-NEXT: strh w8, [sp] +; CHECK_SVE-NEXT: ldr q0, [sp], #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_index_size_acceptable_i16_op_both: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI33_0 +; SVE2_128-NEXT: ldr q0, [x0] +; SVE2_128-NEXT: ldr q1, [x1] +; SVE2_128-NEXT: ldr q2, [x8, :lo12:.LCPI33_0] +; SVE2_128-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h +; SVE2_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_index_size_acceptable_i16_op_both: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.h +; SVE2_1024-NEXT: ldr q0, [x0] +; SVE2_1024-NEXT: adrp x8, .LCPI33_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI33_0 +; SVE2_1024-NEXT: ldr q1, [x1] +; SVE2_1024-NEXT: ld1h { z2.h }, p0/z, [x8] +; SVE2_1024-NEXT: tbl z0.h, { 
z0.h, z1.h }, z2.h +; SVE2_1024-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_index_size_acceptable_i16_op_both: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.h +; SVE2_2048-NEXT: ldr q0, [x0] +; SVE2_2048-NEXT: adrp x8, .LCPI33_0 +; SVE2_2048-NEXT: add x8, x8, :lo12:.LCPI33_0 +; SVE2_2048-NEXT: ldr q1, [x1] +; SVE2_2048-NEXT: ld1h { z2.h }, p0/z, [x8] +; SVE2_2048-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h +; SVE2_2048-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_2048-NEXT: ret + %op1 = load <8 x i16>, ptr %a + %op2 = load <8 x i16>, ptr %b + %1 = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> + ret <8 x i16> %1 +} + +define <16 x double> @shuffle_doublemask_size_unacceptable_form(ptr %a, ptr %b) { +; CHECK_SVE-LABEL: shuffle_doublemask_size_unacceptable_form: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #48 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 48 +; CHECK_SVE-NEXT: ldp q0, q3, [x0] +; CHECK_SVE-NEXT: ldr q1, [x0, #32] +; CHECK_SVE-NEXT: ldr q4, [x1, #48] +; CHECK_SVE-NEXT: mov z1.d, z1.d[1] +; CHECK_SVE-NEXT: mov z2.d, z0.d[1] +; CHECK_SVE-NEXT: mov z3.d, z3.d[1] +; CHECK_SVE-NEXT: stp d1, d2, [sp, #16] +; CHECK_SVE-NEXT: mov z2.d, z4.d[1] +; CHECK_SVE-NEXT: stp d0, d3, [sp, #32] +; CHECK_SVE-NEXT: stp d1, d2, [sp] +; CHECK_SVE-NEXT: ldp q1, q0, [sp, #16] +; CHECK_SVE-NEXT: ldr q7, [sp], #48 +; CHECK_SVE-NEXT: mov z2.d, z0.d +; CHECK_SVE-NEXT: mov z3.d, z1.d +; CHECK_SVE-NEXT: mov z5.d, z1.d +; CHECK_SVE-NEXT: mov z4.d, z0.d +; CHECK_SVE-NEXT: mov z6.d, z0.d +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_doublemask_size_unacceptable_form: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: ldp q5, q2, [x0, #16] +; SVE2_128-NEXT: index z0.d, #3, #-2 +; SVE2_128-NEXT: ldr q1, [x0] +; SVE2_128-NEXT: index z3.d, #0, #3 +; SVE2_128-NEXT: ldr q7, [x1, #48] +; SVE2_128-NEXT: mov z6.d, z2.d[1] +; SVE2_128-NEXT: mov z4.d, z1.d +; SVE2_128-NEXT: tbl z1.d, { z1.d, z2.d }, z0.d +; SVE2_128-NEXT: tbl 
z7.d, { z6.d, z7.d }, z3.d +; SVE2_128-NEXT: tbl z0.d, { z4.d, z5.d }, z3.d +; SVE2_128-NEXT: // kill: def $q7 killed $q7 killed $z7 +; SVE2_128-NEXT: mov z3.d, z1.d +; SVE2_128-NEXT: mov z5.d, z1.d +; SVE2_128-NEXT: mov z2.d, z0.d +; SVE2_128-NEXT: mov z4.d, z0.d +; SVE2_128-NEXT: mov z6.d, z0.d +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_doublemask_size_unacceptable_form: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.d, vl8 +; SVE2_1024-NEXT: adrp x9, .LCPI34_0 +; SVE2_1024-NEXT: add x9, x9, :lo12:.LCPI34_0 +; SVE2_1024-NEXT: ptrue p1.d +; SVE2_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; SVE2_1024-NEXT: ld1d { z1.d }, p0/z, [x1] +; SVE2_1024-NEXT: ld1d { z2.d }, p1/z, [x9] +; SVE2_1024-NEXT: tbl z0.d, { z0.d, z1.d }, z2.d +; SVE2_1024-NEXT: st1d { z0.d }, p1, [x8] +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_doublemask_size_unacceptable_form: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.d, vl8 +; SVE2_2048-NEXT: adrp x9, .LCPI34_0 +; SVE2_2048-NEXT: add x9, x9, :lo12:.LCPI34_0 +; SVE2_2048-NEXT: ptrue p1.d +; SVE2_2048-NEXT: ld1d { z0.d }, p0/z, [x0] +; SVE2_2048-NEXT: ld1d { z1.d }, p0/z, [x1] +; SVE2_2048-NEXT: ld1d { z2.d }, p1/z, [x9] +; SVE2_2048-NEXT: ptrue p0.d, vl16 +; SVE2_2048-NEXT: tbl z0.d, { z0.d, z1.d }, z2.d +; SVE2_2048-NEXT: st1d { z0.d }, p0, [x8] +; SVE2_2048-NEXT: ret + %op1 = load <8 x double>, ptr %a + %op2 = load <8 x double>, ptr %b + %1 = shufflevector <8 x double> %op1, <8 x double> %op2, <16 x i32> + ret <16 x double> %1 +} + +; SVE2_128: .LCPI35_0: +; SVE2_128-NEXT: .hword 1 +; SVE2_128-NEXT: .hword 11 +; SVE2_128-NEXT: .hword 10 +; SVE2_128-NEXT: .hword 3 +; SVE2_128-NEXT: .hword 0 +; SVE2_128-NEXT: .hword 0 +; SVE2_128-NEXT: .hword 0 +; SVE2_128-NEXT: .hword 8 +define <8 x i16> @shuffle_doublemask_size_acceptable_i16_op_both(ptr %a, ptr %b) { +; CHECK_SVE-LABEL: shuffle_doublemask_size_acceptable_i16_op_both: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 
16 +; CHECK_SVE-NEXT: ldr d0, [x1] +; CHECK_SVE-NEXT: ldr d1, [x0] +; CHECK_SVE-NEXT: fmov w8, s0 +; CHECK_SVE-NEXT: mov z2.h, z1.h[3] +; CHECK_SVE-NEXT: fmov w9, s1 +; CHECK_SVE-NEXT: mov z3.h, z0.h[2] +; CHECK_SVE-NEXT: mov z0.h, z0.h[3] +; CHECK_SVE-NEXT: mov z1.h, z1.h[1] +; CHECK_SVE-NEXT: strh w8, [sp, #14] +; CHECK_SVE-NEXT: fmov w8, s2 +; CHECK_SVE-NEXT: strh w9, [sp, #12] +; CHECK_SVE-NEXT: strh w9, [sp, #10] +; CHECK_SVE-NEXT: strh w9, [sp, #8] +; CHECK_SVE-NEXT: fmov w9, s3 +; CHECK_SVE-NEXT: strh w8, [sp, #6] +; CHECK_SVE-NEXT: fmov w8, s0 +; CHECK_SVE-NEXT: strh w9, [sp, #4] +; CHECK_SVE-NEXT: fmov w9, s1 +; CHECK_SVE-NEXT: strh w8, [sp, #2] +; CHECK_SVE-NEXT: strh w9, [sp] +; CHECK_SVE-NEXT: ldr q0, [sp], #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_doublemask_size_acceptable_i16_op_both: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI35_0 +; SVE2_128-NEXT: ldr d0, [x0] +; SVE2_128-NEXT: ldr d1, [x1] +; SVE2_128-NEXT: ldr q2, [x8, :lo12:.LCPI35_0] +; SVE2_128-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h +; SVE2_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_doublemask_size_acceptable_i16_op_both: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.h +; SVE2_1024-NEXT: ldr d0, [x0] +; SVE2_1024-NEXT: adrp x8, .LCPI35_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI35_0 +; SVE2_1024-NEXT: ldr d1, [x1] +; SVE2_1024-NEXT: ld1h { z2.h }, p0/z, [x8] +; SVE2_1024-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h +; SVE2_1024-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_doublemask_size_acceptable_i16_op_both: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.h +; SVE2_2048-NEXT: ldr d0, [x0] +; SVE2_2048-NEXT: adrp x8, .LCPI35_0 +; SVE2_2048-NEXT: add x8, x8, :lo12:.LCPI35_0 +; SVE2_2048-NEXT: ldr d1, [x1] +; SVE2_2048-NEXT: ld1h { z2.h }, p0/z, [x8] +; SVE2_2048-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h +; SVE2_2048-NEXT: // kill: def $q0 killed $q0 killed 
$z0 +; SVE2_2048-NEXT: ret + %op1 = load <4 x i16>, ptr %a + %op2 = load <4 x i16>, ptr %b + %1 = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> <i32 1, i32 7, i32 6, i32 3, i32 0, i32 0, i32 0, i32 4> + ret <8 x i16> %1 +} + +; SVE2_1024: .LCPI36_0: +; SVE2_1024-NEXT: .word 0 +; SVE2_1024-NEXT: .word 1 +; SVE2_1024-NEXT: .word 1 +; SVE2_1024-NEXT: .word 2 +; SVE2_1024-NEXT: .word 4294967295 +; SVE2_1024-NEXT: .word 4294967295 +define <4 x float> @shuffle_halfmask_size_acceptable_float_op_one(ptr %ptr1, ptr %ptr2) { +; CHECK_SVE-LABEL: shuffle_halfmask_size_acceptable_float_op_one: +; CHECK_SVE: // %bb.0: +; CHECK_SVE-NEXT: sub sp, sp, #16 +; CHECK_SVE-NEXT: .cfi_def_cfa_offset 16 +; CHECK_SVE-NEXT: ldr q0, [x0] +; CHECK_SVE-NEXT: mov z1.s, z0.s[2] +; CHECK_SVE-NEXT: mov z2.s, z0.s[1] +; CHECK_SVE-NEXT: stp s2, s1, [sp, #8] +; CHECK_SVE-NEXT: stp s0, s2, [sp] +; CHECK_SVE-NEXT: ldr q0, [sp], #16 +; CHECK_SVE-NEXT: ret +; +; SVE2_128-LABEL: shuffle_halfmask_size_acceptable_float_op_one: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI36_0 +; SVE2_128-NEXT: ldr q1, [x0] +; SVE2_128-NEXT: ldr q0, [x8, :lo12:.LCPI36_0] +; SVE2_128-NEXT: tbl z0.s, { z1.s }, z0.s +; SVE2_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_1024-LABEL: shuffle_halfmask_size_acceptable_float_op_one: +; SVE2_1024: // %bb.0: +; SVE2_1024-NEXT: ptrue p0.s +; SVE2_1024-NEXT: adrp x8, .LCPI36_0 +; SVE2_1024-NEXT: add x8, x8, :lo12:.LCPI36_0 +; SVE2_1024-NEXT: ld1w { z0.s }, p0/z, [x8] +; SVE2_1024-NEXT: ldr q1, [x0] +; SVE2_1024-NEXT: tbl z0.s, { z1.s }, z0.s +; SVE2_1024-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_1024-NEXT: ret +; +; SVE2_2048-LABEL: shuffle_halfmask_size_acceptable_float_op_one: +; SVE2_2048: // %bb.0: +; SVE2_2048-NEXT: ptrue p0.s +; SVE2_2048-NEXT: adrp x8, .LCPI36_0 +; SVE2_2048-NEXT: add x8, x8, :lo12:.LCPI36_0 +; SVE2_2048-NEXT: ld1w { z0.s }, p0/z, [x8] +; SVE2_2048-NEXT: ldr q1, [x0] +; SVE2_2048-NEXT: tbl z0.s, { z1.s }, z0.s +; SVE2_2048-NEXT: // kill: def $q0 killed
$q0 killed $z0 +; SVE2_2048-NEXT: ret + %a = load <8 x float>, ptr %ptr1 + %b = load <8 x float>, ptr %ptr2 + %1 = shufflevector <8 x float> %a, <8 x float> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2> + ret <4 x float> %1 +} + +attributes #0 = { "target-features"="+sve" }