Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8921,8 +8921,8 @@ for (auto &Src : Sources) { EVT SrcVT = Src.ShuffleVec.getValueType(); - uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); - if (SrcVTSize == VTSize) + TypeSize SrcVTSize = SrcVT.getSizeInBits(); + if (SrcVTSize == TypeSize::Fixed(VTSize)) continue; // This stage of the search produces a source with the same element type as @@ -8931,7 +8931,7 @@ unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); - if (SrcVTSize < VTSize) { + if (!SrcVTSize.isScalable() && SrcVTSize.getFixedValue() < VTSize) { assert(2 * SrcVTSize == VTSize); // We can pad out the smaller vector for free, so if it's part of a // shuffle... @@ -8941,7 +8941,8 @@ continue; } - if (SrcVTSize != 2 * VTSize) { + if ((!SrcVTSize.isScalable() && SrcVTSize.getFixedValue() != 2 * VTSize) || + (SrcVTSize.isScalable() && SrcVTSize.getKnownMinValue() != VTSize)) { LLVM_DEBUG( dbgs() << "Reshuffle failed: result vector too small to extract\n"); return SDValue(); @@ -9004,8 +9005,10 @@ // Final check before we try to actually produce a shuffle. LLVM_DEBUG(for (auto Src - : Sources) - assert(Src.ShuffleVec.getValueType() == ShuffleVT);); + : Sources) assert(Src.ShuffleVec.getValueType() + .getSizeInBits() + .getKnownMinValue() == + ShuffleVT.getFixedSizeInBits());); // The stars all align, our next step is to produce the mask for the shuffle. SmallVector Mask(ShuffleVT.getVectorNumElements(), -1); Index: llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll @@ -361,6 +361,106 @@ ret <16 x i8> %retval } + +; Predicates + +define <2 x i1> @extract_v2i1_nxv2i1( %inmask) { +; CHECK-LABEL: extract_v2i1_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %mask = call <2 x i1> @llvm.experimental.vector.extract.v2i1.nxv2i1( %inmask, i64 0) + ret <2 x i1> %mask +} + +define <4 x i1> @extract_v4i1_nxv4i1( %inmask) { +; CHECK-LABEL: extract_v4i1_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[3] +; CHECK-NEXT: mov v0.h[2], w9 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %mask = call <4 x i1> @llvm.experimental.vector.extract.v4i1.nxv4i1( %inmask, i64 0) + ret <4 x i1> %mask +} + +define <8 x i1> @extract_v8i1_nxv8i1( %inmask) { +; CHECK-LABEL: extract_v8i1_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov v0.b[1], w8 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: mov v0.b[2], w9 +; CHECK-NEXT: umov w9, v1.h[4] +; CHECK-NEXT: mov v0.b[3], w8 +; CHECK-NEXT: umov w8, v1.h[5] +; CHECK-NEXT: mov v0.b[4], w9 +; CHECK-NEXT: umov w9, v1.h[6] +; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: umov w8, v1.h[7] +; CHECK-NEXT: mov v0.b[6], w9 +; CHECK-NEXT: mov v0.b[7], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %mask = call <8 x i1> @llvm.experimental.vector.extract.v8i1.nxv8i1( %inmask, i64 0) + ret <8 x i1> %mask +} + +define <16 x i1> @extract_v16i1_nxv16i1( %inmask) { +; CHECK-LABEL: extract_v16i1_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v1.b[1] +; CHECK-NEXT: umov w9, v1.b[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov v0.b[1], w8 +; CHECK-NEXT: umov w8, v1.b[3] +; CHECK-NEXT: mov v0.b[2], w9 +; CHECK-NEXT: umov w9, v1.b[4] +; CHECK-NEXT: mov v0.b[3], w8 +; CHECK-NEXT: umov w8, v1.b[5] +; CHECK-NEXT: mov v0.b[4], w9 +; CHECK-NEXT: umov w9, v1.b[6] +; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: umov w8, v1.b[7] +; CHECK-NEXT: mov v0.b[6], w9 +; CHECK-NEXT: umov w9, v1.b[8] +; CHECK-NEXT: mov v0.b[7], w8 +; CHECK-NEXT: umov w8, v1.b[9] +; CHECK-NEXT: mov v0.b[8], w9 +; CHECK-NEXT: umov w9, v1.b[10] +; CHECK-NEXT: mov v0.b[9], w8 +; CHECK-NEXT: umov w8, v1.b[11] +; CHECK-NEXT: mov v0.b[10], w9 +; CHECK-NEXT: umov w9, v1.b[12] +; CHECK-NEXT: mov v0.b[11], w8 +; CHECK-NEXT: umov w8, v1.b[13] +; CHECK-NEXT: mov v0.b[12], w9 +; CHECK-NEXT: umov w9, v1.b[14] +; CHECK-NEXT: mov v0.b[13], w8 +; CHECK-NEXT: umov w8, v1.b[15] +; CHECK-NEXT: mov v0.b[14], w9 +; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: ret + %mask = call <16 x i1> @llvm.experimental.vector.extract.v16i1.nxv16i1( %inmask, i64 0) + ret <16 x i1> %mask +} + + ; Fixed length clamping define <2 x i64> @extract_fixed_v2i64_nxv2i64( %vec) nounwind #0 { @@ -441,4 +541,9 @@ declare <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv4i8(, i64) declare <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv2i8(, i64) +declare <2 x i1> @llvm.experimental.vector.extract.v2i1.nxv2i1(, i64) +declare <4 x i1> @llvm.experimental.vector.extract.v4i1.nxv4i1(, i64) +declare <8 x i1> @llvm.experimental.vector.extract.v8i1.nxv8i1(, i64) +declare <16 x i1> @llvm.experimental.vector.extract.v16i1.nxv16i1(, i64) + declare <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64(, i64) Index: llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll @@ -0,0 +1,897 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; == Matching first N elements == + +define <2 x i1> @reshuffle_v2i1_nxv2i1( %a) #0 { +; CHECK-LABEL: reshuffle_v2i1_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %v0 = insertelement <2 x i1> undef, i1 %el0, i32 0 + %v1 = insertelement <2 x i1> %v0, i1 %el1, i32 1 + ret <2 x i1> %v1 +} + + +define <2 x i1> @reshuffle_v2i1_nxv4i1( %a) #0 { +; CHECK-LABEL: reshuffle_v2i1_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %v0 = insertelement <2 x i1> undef, i1 %el0, i32 0 + %v1 = insertelement <2 x i1> %v0, i1 %el1, i32 1 + ret <2 x i1> %v1 +} + + +; ReconstructShuffle fails +define <4 x i1> @reshuffle_v4i1_nxv4i1( %a) #0 { +; CHECK-LABEL: reshuffle_v4i1_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[3] +; CHECK-NEXT: mov v0.h[2], w9 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %v0 = insertelement <4 x i1> undef, i1 %el0, i32 0 + %v1 = insertelement <4 x i1> %v0, i1 %el1, i32 1 + %v2 = insertelement <4 x i1> %v1, i1 %el2, i32 2 + %v3 = insertelement <4 x i1> %v2, i1 %el3, i32 3 + ret <4 x i1> %v3 +} + + +; ReconstructShuffle fails +define <4 x i1> @reshuffle_v4i1_nxv8i1( %a) #0 { +; CHECK-LABEL: reshuffle_v4i1_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: mov v0.h[2], w9 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %v0 = insertelement <4 x i1> undef, i1 %el0, i32 0 + %v1 = insertelement <4 x i1> %v0, i1 %el1, i32 1 + %v2 = insertelement <4 x i1> %v1, i1 %el2, i32 2 + %v3 = insertelement <4 x i1> %v2, i1 %el3, i32 3 + ret <4 x i1> %v3 +} + + +; ReconstructShuffle fails +define <8 x i1> @reshuffle_v8i1_nxv8i1( %a) #0 { +; CHECK-LABEL: reshuffle_v8i1_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov v0.b[1], w8 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: mov v0.b[2], w9 +; CHECK-NEXT: umov w9, v1.h[4] +; CHECK-NEXT: mov v0.b[3], w8 +; CHECK-NEXT: umov w8, v1.h[5] +; CHECK-NEXT: mov v0.b[4], w9 +; CHECK-NEXT: umov w9, v1.h[6] +; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: umov w8, v1.h[7] +; CHECK-NEXT: mov v0.b[6], w9 +; CHECK-NEXT: mov v0.b[7], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %el4 = extractelement %a, i32 4 + %el5 = extractelement %a, i32 5 + %el6 = extractelement %a, i32 6 + %el7 = extractelement %a, i32 7 + %v0 = insertelement <8 x i1> undef, i1 %el0, i32 0 + %v1 = insertelement <8 x i1> %v0, i1 %el1, i32 1 + %v2 = insertelement <8 x i1> %v1, i1 %el2, i32 2 + %v3 = insertelement <8 x i1> %v2, i1 %el3, i32 3 + %v4 = insertelement <8 x i1> %v3, i1 %el4, i32 4 + %v5 = insertelement <8 x i1> %v4, i1 %el5, i32 5 + %v6 = insertelement <8 x i1> %v5, i1 %el6, i32 6 + %v7 = insertelement <8 x i1> %v6, i1 %el7, i32 7 + ret <8 x i1> %v7 +} + + +; ReconstructShuffle fails +define <16 x i1> @reshuffle_v16i1_nxv16i1( %a) #0 { +; CHECK-LABEL: reshuffle_v16i1_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v1.b[1] +; CHECK-NEXT: umov w9, v1.b[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov v0.b[1], w8 +; CHECK-NEXT: umov w8, v1.b[3] +; CHECK-NEXT: mov v0.b[2], w9 +; CHECK-NEXT: umov w9, v1.b[4] +; CHECK-NEXT: mov v0.b[3], w8 +; CHECK-NEXT: umov w8, v1.b[5] +; CHECK-NEXT: mov v0.b[4], w9 +; CHECK-NEXT: umov w9, v1.b[6] +; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: umov w8, v1.b[7] +; CHECK-NEXT: mov v0.b[6], w9 +; CHECK-NEXT: umov w9, v1.b[8] +; CHECK-NEXT: mov v0.b[7], w8 +; CHECK-NEXT: umov w8, v1.b[9] +; CHECK-NEXT: mov v0.b[8], w9 +; CHECK-NEXT: umov w9, v1.b[10] +; CHECK-NEXT: mov v0.b[9], w8 +; CHECK-NEXT: umov w8, v1.b[11] +; CHECK-NEXT: mov v0.b[10], w9 +; CHECK-NEXT: umov w9, v1.b[12] +; CHECK-NEXT: mov v0.b[11], w8 +; CHECK-NEXT: umov w8, v1.b[13] +; CHECK-NEXT: mov v0.b[12], w9 +; CHECK-NEXT: umov w9, v1.b[14] +; CHECK-NEXT: mov v0.b[13], w8 +; CHECK-NEXT: umov w8, v1.b[15] +; CHECK-NEXT: mov v0.b[14], w9 +; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %el4 = extractelement %a, i32 4 + %el5 = extractelement %a, i32 5 + %el6 = extractelement %a, i32 6 + %el7 = extractelement %a, i32 7 + %el8 = extractelement %a, i32 8 + %el9 = extractelement %a, i32 9 + %el10 = extractelement %a, i32 10 + %el11 = extractelement %a, i32 11 + %el12 = extractelement %a, i32 12 + %el13 = extractelement %a, i32 13 + %el14 = extractelement %a, i32 14 + %el15 = extractelement %a, i32 15 + %v0 = insertelement <16 x i1> undef, i1 %el0, i32 0 + %v1 = insertelement <16 x i1> %v0, i1 %el1, i32 1 + %v2 = insertelement <16 x i1> %v1, i1 %el2, i32 2 + %v3 = insertelement <16 x i1> %v2, i1 %el3, i32 3 + %v4 = insertelement <16 x i1> %v3, i1 %el4, i32 4 + %v5 = insertelement <16 x i1> %v4, i1 %el5, i32 5 + %v6 = insertelement <16 x i1> %v5, i1 %el6, i32 6 + %v7 = insertelement <16 x i1> %v6, i1 %el7, i32 7 + %v8 = insertelement <16 x i1> %v7, i1 %el8, i32 8 + %v9 = insertelement <16 x i1> %v8, i1 %el9, i32 9 + %v10 = insertelement <16 x i1> %v9, i1 %el10, i32 10 + %v11 = insertelement <16 x i1> %v10, i1 %el11, i32 11 + %v12 = insertelement <16 x i1> %v11, i1 %el12, i32 12 + %v13 = insertelement <16 x i1> %v12, i1 %el13, i32 13 + %v14 = insertelement <16 x i1> %v13, i1 %el14, i32 14 + %v15 = insertelement <16 x i1> %v14, i1 %el15, i32 15 + ret <16 x i1> %v15 +} + + +define <2 x i64> @reshuffle_v2i64_nxv2i64( %a) #0 { +; CHECK-LABEL: reshuffle_v2i64_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %v0 = insertelement <2 x i64> undef, i64 %el0, i32 0 + %v1 = insertelement <2 x i64> %v0, i64 %el1, i32 1 + ret <2 x i64> %v1 +} + + +define <2 x i32> @reshuffle_v2i32_nxv2i32( %a) #0 { +; CHECK-LABEL: reshuffle_v2i32_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %v0 = insertelement <2 x i32> undef, i32 %el0, i32 0 + %v1 = insertelement <2 x i32> %v0, i32 %el1, i32 1 + ret <2 x i32> %v1 +} + + +define <2 x i32> @reshuffle_v2i32_nxv4i32( %a) #0 { +; CHECK-LABEL: reshuffle_v2i32_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %v0 = insertelement <2 x i32> undef, i32 %el0, i32 0 + %v1 = insertelement <2 x i32> %v0, i32 %el1, i32 1 + ret <2 x i32> %v1 +} + + +define <4 x i32> @reshuffle_v4i32_nxv4i32( %a) #0 { +; CHECK-LABEL: reshuffle_v4i32_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %v0 = insertelement <4 x i32> undef, i32 %el0, i32 0 + %v1 = insertelement <4 x i32> %v0, i32 %el1, i32 1 + %v2 = insertelement <4 x i32> %v1, i32 %el2, i32 2 + %v3 = insertelement <4 x i32> %v2, i32 %el3, i32 3 + ret <4 x i32> %v3 +} + + +define <4 x i16> @reshuffle_v4i16_nxv4i16( %a) #0 { +; CHECK-LABEL: reshuffle_v4i16_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %v0 = insertelement <4 x i16> undef, i16 %el0, i32 0 + %v1 = insertelement <4 x i16> %v0, i16 %el1, i32 1 + %v2 = insertelement <4 x i16> %v1, i16 %el2, i32 2 + %v3 = insertelement <4 x i16> %v2, i16 %el3, i32 3 + ret <4 x i16> %v3 +} + + +define <4 x i16> @reshuffle_v4i16_nxv8i16( %a) #0 { +; CHECK-LABEL: reshuffle_v4i16_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %v0 = insertelement <4 x i16> undef, i16 %el0, i32 0 + %v1 = insertelement <4 x i16> %v0, i16 %el1, i32 1 + %v2 = insertelement <4 x i16> %v1, i16 %el2, i32 2 + %v3 = insertelement <4 x i16> %v2, i16 %el3, i32 3 + ret <4 x i16> %v3 +} + + +define <8 x i16> @reshuffle_v8i16_nxv8i16( %a) #0 { +; CHECK-LABEL: reshuffle_v8i16_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %el4 = extractelement %a, i32 4 + %el5 = extractelement %a, i32 5 + %el6 = extractelement %a, i32 6 + %el7 = extractelement %a, i32 7 + %v0 = insertelement <8 x i16> undef, i16 %el0, i32 0 + %v1 = insertelement <8 x i16> %v0, i16 %el1, i32 1 + %v2 = insertelement <8 x i16> %v1, i16 %el2, i32 2 + %v3 = insertelement <8 x i16> %v2, i16 %el3, i32 3 + %v4 = insertelement <8 x i16> %v3, i16 %el4, i32 4 + %v5 = insertelement <8 x i16> %v4, i16 %el5, i32 5 + %v6 = insertelement <8 x i16> %v5, i16 %el6, i32 6 + %v7 = insertelement <8 x i16> %v6, i16 %el7, i32 7 + ret <8 x i16> %v7 +} + + +define <16 x i8> @reshuffle_v16i8_nxv16i8( %a) #0 { +; CHECK-LABEL: reshuffle_v16i8_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %el4 = extractelement %a, i32 4 + %el5 = extractelement %a, i32 5 + %el6 = extractelement %a, i32 6 + %el7 = extractelement %a, i32 7 + %el8 = extractelement %a, i32 8 + %el9 = extractelement %a, i32 9 + %el10 = extractelement %a, i32 10 + %el11 = extractelement %a, i32 11 + %el12 = extractelement %a, i32 12 + %el13 = extractelement %a, i32 13 + %el14 = extractelement %a, i32 14 + %el15 = extractelement %a, i32 15 + %v0 = insertelement <16 x i8> undef, i8 %el0, i32 0 + %v1 = insertelement <16 x i8> %v0, i8 %el1, i32 1 + %v2 = insertelement <16 x i8> %v1, i8 %el2, i32 2 + %v3 = insertelement <16 x i8> %v2, i8 %el3, i32 3 + %v4 = insertelement <16 x i8> %v3, i8 %el4, i32 4 + %v5 = insertelement <16 x i8> %v4, i8 %el5, i32 5 + %v6 = insertelement <16 x i8> %v5, i8 %el6, i32 6 + %v7 = insertelement <16 x i8> %v6, i8 %el7, i32 7 + %v8 = insertelement <16 x i8> %v7, i8 %el8, i32 8 + %v9 = insertelement <16 x i8> %v8, i8 %el9, i32 9 + %v10 = insertelement <16 x i8> %v9, i8 %el10, i32 10 + %v11 = insertelement <16 x i8> %v10, i8 %el11, i32 11 + %v12 = insertelement <16 x i8> %v11, i8 %el12, i32 12 + %v13 = insertelement <16 x i8> %v12, i8 %el13, i32 13 + %v14 = insertelement <16 x i8> %v13, i8 %el14, i32 14 + %v15 = insertelement <16 x i8> %v14, i8 %el15, i32 15 + ret <16 x i8> %v15 +} + + +define <8 x i8> @reshuffle_v8i8_nxv16i8( %a) #0 { +; CHECK-LABEL: reshuffle_v8i8_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %el4 = extractelement %a, i32 4 + %el5 = extractelement %a, i32 5 + %el6 = extractelement %a, i32 6 + %el7 = extractelement %a, i32 7 + %v0 = insertelement <8 x i8> undef, i8 %el0, i32 0 + %v1 = insertelement <8 x i8> %v0, i8 %el1, i32 1 + %v2 = insertelement <8 x i8> %v1, i8 %el2, i32 2 + %v3 = insertelement <8 x i8> %v2, i8 %el3, i32 3 + %v4 = insertelement <8 x i8> %v3, i8 %el4, i32 4 + %v5 = insertelement <8 x i8> %v4, i8 %el5, i32 5 + %v6 = insertelement <8 x i8> %v5, i8 %el6, i32 6 + %v7 = insertelement <8 x i8> %v6, i8 %el7, i32 7 + ret <8 x i8> %v7 +} + + +define <2 x float> @reshuffle_v2f32_nxv2f32( %a) #0 { +; CHECK-LABEL: reshuffle_v2f32_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: addpl x8, sp, #4 +; CHECK-NEXT: ldr d0, [x8] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %v0 = insertelement <2 x float> undef, float %el0, i32 0 + %v1 = insertelement <2 x float> %v0, float %el1, i32 1 + ret <2 x float> %v1 +} + + +define <4 x float> @reshuffle_v4f32_nxv4f32( %a) #0 { +; CHECK-LABEL: reshuffle_v4f32_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %v0 = insertelement <4 x float> undef, float %el0, i32 0 + %v1 = insertelement <4 x float> %v0, float %el1, i32 1 + %v2 = insertelement <4 x float> %v1, float %el2, i32 2 + %v3 = insertelement <4 x float> %v2, float %el3, i32 3 + ret <4 x float> %v3 +} + + +; == Reversed first N elements == + +; ReconstructShuffle - fails +define <4 x i16> @reshuffle_v4i16_nxv4i16_reverse( %a) #0 { +; CHECK-LABEL: reshuffle_v4i16_nxv4i16_reverse: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, v0.s[3] +; CHECK-NEXT: mov w9, v0.s[2] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: mov v1.h[2], w8 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %v0 = insertelement <4 x i16> undef, i16 %el3, i32 0 + %v1 = insertelement <4 x i16> %v0, i16 %el2, i32 1 + %v2 = insertelement <4 x i16> %v1, i16 %el1, i32 2 + %v3 = insertelement <4 x i16> %v2, i16 %el0, i32 3 + ret <4 x i16> %v3 +} + + +; ReconstructShuffle - fails +define <8 x i16> @reshuffle_v8i16_nxv8i16_reverse( %a) #0 { +; CHECK-LABEL: reshuffle_v8i16_nxv8i16_reverse: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w8, v0.h[7] +; CHECK-NEXT: umov w9, v0.h[6] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: umov w8, v0.h[5] +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: umov w9, v0.h[4] +; CHECK-NEXT: mov v1.h[2], w8 +; CHECK-NEXT: umov w8, v0.h[3] +; CHECK-NEXT: mov v1.h[3], w9 +; CHECK-NEXT: umov w9, v0.h[2] +; CHECK-NEXT: mov v1.h[4], w8 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov v1.h[5], w9 +; CHECK-NEXT: mov v1.h[6], w8 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov v1.h[7], w8 +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %el4 = extractelement %a, i32 4 + %el5 = extractelement %a, i32 5 + %el6 = extractelement %a, i32 6 + %el7 = extractelement %a, i32 7 + %v0 = insertelement <8 x i16> undef, i16 %el7, i32 0 + %v1 = insertelement <8 x i16> %v0, i16 %el6, i32 1 + %v2 = insertelement <8 x i16> %v1, i16 %el5, i32 2 + %v3 = insertelement <8 x i16> %v2, i16 %el4, i32 3 + %v4 = insertelement <8 x i16> %v3, i16 %el3, i32 4 + %v5 = insertelement <8 x i16> %v4, i16 %el2, i32 5 + %v6 = insertelement <8 x i16> %v5, i16 %el1, i32 6 + %v7 = insertelement <8 x i16> %v6, i16 %el0, i32 7 + ret <8 x i16> %v7 +} + + +; == Result requires padding one source with undef elements == + +define <2 x i32> @reshuffle_v2i32_nxv2i32_undef( %a) #0 { +; CHECK-LABEL: reshuffle_v2i32_nxv2i32_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %v0 = insertelement <2 x i32> undef, i32 %el0, i32 0 + ret <2 x i32> %v0 +} + + +define <2 x i32> @reshuffle_v2i32_nxv2i32_undef2( %a) #0 { +; CHECK-LABEL: reshuffle_v2i32_nxv2i32_undef2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, v0.s[2] +; CHECK-NEXT: dup v0.2s, w8 +; CHECK-NEXT: ret + %el1 = extractelement %a, i32 1 + %v0 = insertelement <2 x i32> undef, i32 %el1, i32 1 + ret <2 x i32> %v0 +} + + +define <2 x i32> @reshuffle_v2i32_nxv4i32_undef( %a) #0 { +; CHECK-LABEL: reshuffle_v2i32_nxv4i32_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %v0 = insertelement <2 x i32> undef, i32 %el0, i32 0 + ret <2 x i32> %v0 +} + + +define <2 x i32> @reshuffle_v2i32_nxv4i32_undef2( %a) #0 { +; CHECK-LABEL: reshuffle_v2i32_nxv4i32_undef2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: dup v0.2s, w8 +; CHECK-NEXT: ret + %el1 = extractelement %a, i32 1 + %v0 = insertelement <2 x i32> undef, i32 %el1, i32 1 + ret <2 x i32> %v0 +} + + +; ReconstructShuffle - succeeds +define <4 x i32> @reshuffle_v4i32_nxv4i32_undef( %a) #0 { +; CHECK-LABEL: reshuffle_v4i32_nxv4i32_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %v0 = insertelement <4 x i32> undef, i32 %el0, i32 0 + %v1 = insertelement <4 x i32> %v0, i32 %el1, i32 1 + ret <4 x i32> %v1 +} + + +; ReconstructShuffle - succeeds +define <4 x i32> @reshuffle_v4i32_nxv4i32_undef2( %a) #0 { +; CHECK-LABEL: reshuffle_v4i32_nxv4i32_undef2: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %v0 = insertelement <4 x i32> undef, i32 %el2, i32 2 + %v1 = insertelement <4 x i32> %v0, i32 %el3, i32 3 + ret <4 x i32> %v1 +} + + +define <2 x i16> @reshuffle_v2i16_nxv8i16_undef( %a) #0 { +; CHECK-LABEL: reshuffle_v2i16_nxv8i16_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %v0 = insertelement <2 x i16> undef, i16 %el0, i32 0 + ret <2 x i16> %v0 +} + + +define <2 x i16> @reshuffle_v2i16_nxv8i16_undef2( %a) #0 { +; CHECK-LABEL: reshuffle_v2i16_nxv8i16_undef2: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: dup v0.2s, w8 +; CHECK-NEXT: ret + %el1 = extractelement %a, i32 1 + %v0 = insertelement <2 x i16> undef, i16 %el1, i32 1 + ret <2 x i16> %v0 +} + + +; ReconstructShuffle - fails +define <4 x i16> @reshuffle_v4i16_nxv8i16_undef( %a) #0 { +; CHECK-LABEL: reshuffle_v4i16_nxv8i16_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %v0 = insertelement <4 x i16> undef, i16 %el0, i32 0 + %v1 = insertelement <4 x i16> %v0, i16 %el1, i32 1 + ret <4 x i16> %v1 +} + +; ReconstructShuffle - fails +define <4 x i16> @reshuffle_v4i16_nxv8i16_undef2( %a) #0 { +; CHECK-LABEL: reshuffle_v4i16_nxv8i16_undef2: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w8, v0.h[2] +; CHECK-NEXT: umov w9, v0.h[3] +; CHECK-NEXT: mov v0.h[2], w8 +; CHECK-NEXT: mov v0.h[3], w9 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %v0 = insertelement <4 x i16> undef, i16 %el2, i32 2 + %v1 = insertelement <4 x i16> %v0, i16 %el3, i32 3 + ret <4 x i16> %v1 +} + + +; ReconstructShuffle - succeeds +define <8 x i16> @reshuffle_v8i16_nxv8i16_undef( %a) #0 { +; CHECK-LABEL: reshuffle_v8i16_nxv8i16_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %v0 = insertelement <8 x i16> undef, i16 %el0, i32 0 + %v1 = insertelement <8 x i16> %v0, i16 %el1, i32 1 + %v2 = insertelement <8 x i16> %v1, i16 %el2, i32 2 + %v3 = insertelement <8 x i16> %v2, i16 %el3, i32 3 + ret <8 x i16> %v3 +} + + +; ReconstructShuffle - succeeds +define <8 x i16> @reshuffle_v8i16_nxv8i16_undef2( %a) #0 { +; CHECK-LABEL: reshuffle_v8i16_nxv8i16_undef2: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %el4 = extractelement %a, i32 4 + %el5 = extractelement %a, i32 5 + %el6 = extractelement %a, i32 6 + %el7 = extractelement %a, i32 7 + %v0 = insertelement <8 x i16> undef, i16 %el4, i32 4 + %v1 = insertelement <8 x i16> %v0, i16 %el5, i32 5 + %v2 = insertelement <8 x i16> %v1, i16 %el6, i32 6 + %v3 = insertelement <8 x i16> %v2, i16 %el7, i32 7 + ret <8 x i16> %v3 +} + + +; ReconstructShuffle - succeeds +define <4 x i32> @reshuffle_v4i32_nxv4i32_reverse_undef( %a) #0 { +; CHECK-LABEL: reshuffle_v4i32_nxv4i32_reverse_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: rev64 v0.4s, v0.4s +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %v0 = insertelement <4 x i32> undef, i32 %el1, i32 0 + %v1 = insertelement <4 x i32> %v0, i32 %el0, i32 1 + ret <4 x i32> %v1 +} + + +; ReconstructShuffle - succeeds +define <4 x i32> @reshuffle_v4i32_nxv4i32_reverse_undef2( %a) #0 { +; CHECK-LABEL: reshuffle_v4i32_nxv4i32_reverse_undef2: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; CHECK-NEXT: ret + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %v0 = insertelement <4 x i32> undef, i32 %el3, i32 0 + %v1 = insertelement <4 x i32> %v0, i32 %el2, i32 1 + ret <4 x i32> %v1 +} + + +; ReconstructShuffle - succeeds +define <8 x i16> @reshuffle_v8i16_nxv8i16_reverse_undef( %a) #0 { +; CHECK-LABEL: reshuffle_v8i16_nxv8i16_reverse_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: rev64 v0.8h, v0.8h +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %v0 = insertelement <8 x i16> undef, i16 %el3, i32 0 + %v1 = insertelement <8 x i16> %v0, i16 %el2, i32 1 + %v2 = insertelement <8 x i16> %v1, i16 %el1, i32 2 + %v3 = insertelement <8 x i16> %v2, i16 %el0, i32 3 + ret <8 x i16> %v3 +} + + +; ReconstructShuffle - fails +define <8 x i16> @reshuffle_v8i16_nxv8i16_reverse_undef2( %a) #0 { +; CHECK-LABEL: reshuffle_v8i16_nxv8i16_reverse_undef2: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w8, v0.h[7] +; CHECK-NEXT: umov w9, v0.h[6] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: umov w8, v0.h[5] +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: umov w9, v0.h[4] +; CHECK-NEXT: mov v1.h[2], w8 +; CHECK-NEXT: mov v1.h[3], w9 +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret + %el4 = extractelement %a, i32 4 + %el5 = extractelement %a, i32 5 + %el6 = extractelement %a, i32 6 + %el7 = extractelement %a, i32 7 + %v0 = insertelement <8 x i16> undef, i16 %el7, i32 0 + %v1 = insertelement <8 x i16> %v0, i16 %el6, i32 1 + %v2 = insertelement <8 x i16> %v1, i16 %el5, i32 2 + %v3 = insertelement <8 x i16> %v2, i16 %el4, i32 3 + ret <8 x i16> %v3 +} + + +; == Shuffle comes from two input sources == + +define <2 x i64> @reshuffle_v2i64_2x_nxv2i64( %a, %b) #0 { +; CHECK-LABEL: reshuffle_v2i64_2x_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %b, i32 0 + %v0 = insertelement <2 x i64> undef, i64 %el0, i32 0 + %v1 = insertelement <2 x i64> %v0, i64 %el1, i32 1 + ret <2 x i64> %v1 +} + + +define <2 x i32> @reshuffle_v2i32_2x_nxv2i32( %a, %b) #0 { +; CHECK-LABEL: reshuffle_v2i32_2x_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %b, i32 0 + %v0 = insertelement <2 x i32> undef, i32 %el0, i32 0 + %v1 = insertelement <2 x i32> %v0, i32 %el1, i32 1 + ret <2 x i32> %v1 +} + + +; ReconstructShuffle - succeeds +define <4 x i32> @reshuffle_v4i32_2x_nxv4i32( %a, %b) #0 { +; CHECK-LABEL: reshuffle_v4i32_2x_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %b, i32 0 + %el3 = extractelement %b, i32 1 + %v0 = insertelement <4 x i32> undef, i32 %el0, i32 0 + %v1 = insertelement <4 x i32> %v0, i32 %el1, i32 1 + %v2 = insertelement <4 x i32> %v1, i32 %el2, i32 2 + %v3 = insertelement <4 x i32> %v2, i32 %el3, i32 3 + ret <4 x i32> %v3 +} + + +; ReconstructShuffle - fails +define <4 x i16> @reshuffle_v4i16_2x_nxv8i16( %a, %b) #0 { +; CHECK-LABEL: reshuffle_v4i16_2x_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: umov w9, v1.h[1] +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.h[2], w8 +; CHECK-NEXT: mov v0.h[3], w9 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %b, i32 0 + %el3 = extractelement %b, i32 1 + %v0 = insertelement <4 x i16> undef, i16 %el0, i32 0 + %v1 = insertelement <4 x i16> %v0, i16 %el1, i32 1 + %v2 = insertelement <4 x i16> %v1, i16 %el2, i32 2 + %v3 = insertelement <4 x i16> %v2, i16 %el3, i32 3 + ret <4 x i16> %v3 +} + + +; ReconstructShuffle - succeeds +define <8 x i16> @reshuffle_v8i16_2x_nxv8i16( %a, %b) #0 { +; CHECK-LABEL: reshuffle_v8i16_2x_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %a, i32 2 + %el3 = extractelement %a, i32 3 + %el4 = extractelement %b, i32 0 + %el5 = extractelement %b, i32 1 + %el6 = extractelement %b, i32 2 + %el7 = extractelement %b, i32 3 + %v0 = insertelement <8 x i16> undef, i16 %el0, i32 0 + %v1 = insertelement <8 x i16> %v0, i16 %el1, i32 1 + %v2 = insertelement <8 x i16> %v1, i16 %el2, i32 2 + %v3 = insertelement <8 x i16> %v2, i16 %el3, i32 3 + %v4 = insertelement <8 x i16> %v3, i16 %el4, i32 4 + %v5 = insertelement <8 x i16> %v4, i16 %el5, i32 5 + %v6 = insertelement <8 x i16> %v5, i16 %el6, i32 6 + %v7 = insertelement <8 x i16> %v6, i16 %el7, i32 7 + ret <8 x i16> %v7 +} + + +; == Shuffle comes from two input sources and result requires padding with undef == + + +; ReconstructShuffle - succeeds +define <4 x i32> @reshuffle_v4i32_2x_nxv4i32_undef1( %a, %b) #0 { +; CHECK-LABEL: reshuffle_v4i32_2x_nxv4i32_undef1: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %b, i32 0 + %v0 = insertelement <4 x i32> undef, i32 %el0, i32 0 + %v1 = insertelement <4 x i32> %v0, i32 %el1, i32 1 + ret <4 x i32> %v1 +} + + +; ReconstructShuffle - succeeds +define <8 x i16> @reshuffle_v8i16_2x_nxv8i16_undef1( %a, %b) #0 { +; CHECK-LABEL: reshuffle_v8i16_2x_nxv8i16_undef1: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %b, i32 0 + %v0 = insertelement <8 x i16> undef, i16 %el0, i32 0 + %v1 = insertelement <8 x i16> %v0, i16 %el1, i32 1 + ret <8 x i16> %v1 +} + + +; ReconstructShuffle - fails +define <8 x i16> @reshuffle_v8i16_2x_nxv8i16_undef2( %a, %b) #0 { +; CHECK-LABEL: reshuffle_v8i16_2x_nxv8i16_undef2: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: umov w9, v1.h[1] +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.h[2], w8 +; CHECK-NEXT: mov v0.h[3], w9 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %el0 = extractelement %a, i32 0 + %el1 = extractelement %a, i32 1 + %el2 = extractelement %b, i32 0 + %el3 = extractelement %b, i32 1 + %v0 = insertelement <8 x i16> undef, i16 %el0, i32 0 + %v1 = insertelement <8 x i16> %v0, i16 %el1, i32 1 + %v2 = insertelement <8 x i16> %v1, i16 %el2, i32 2 + %v3 = insertelement <8 x i16> %v2, i16 %el3, i32 3 + ret <8 x i16> %v3 +} + + +attributes #0 = { "target-features"="+sve" }