Index: llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
+++ llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
@@ -5,6 +5,40 @@
 
 ; == Matching first N elements ==
 
+define <2 x i1> @reshuffle_v2i1_nxv2i1(<vscale x 2 x i1> %a) #0 {
+; CHECK-LABEL: reshuffle_v2i1_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 2 x i1> %a, i32 0
+  %el1 = extractelement <vscale x 2 x i1> %a, i32 1
+  %v0 = insertelement <2 x i1> undef, i1 %el0, i32 0
+  %v1 = insertelement <2 x i1> %v0, i1 %el1, i32 1
+  ret <2 x i1> %v1
+}
+
+
+define <2 x i1> @reshuffle_v2i1_nxv4i1(<vscale x 4 x i1> %a) #0 {
+; CHECK-LABEL: reshuffle_v2i1_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 4 x i1> %a, i32 0
+  %el1 = extractelement <vscale x 4 x i1> %a, i32 1
+  %v0 = insertelement <2 x i1> undef, i1 %el0, i32 0
+  %v1 = insertelement <2 x i1> %v0, i1 %el1, i32 1
+  ret <2 x i1> %v1
+}
+
+
 define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) #0 {
 ; CHECK-LABEL: reshuffle_v4i1_nxv4i1:
 ; CHECK:       // %bb.0:
@@ -29,4 +63,867 @@
   ret <4 x i1> %v3
 }
 
+
+define <4 x i1> @reshuffle_v4i1_nxv8i1(<vscale x 8 x i1> %a) #0 {
+; CHECK-LABEL: reshuffle_v4i1_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
+; CHECK-NEXT:    umov w8, v1.h[1]
+; CHECK-NEXT:    umov w9, v1.h[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    umov w8, v1.h[3]
+; CHECK-NEXT:    mov v0.h[2], w9
+; CHECK-NEXT:    mov v0.h[3], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 8 x i1> %a, i32 0
+  %el1 = extractelement <vscale x 8 x i1> %a, i32 1
+  %el2 = extractelement <vscale x 8 x i1> %a, i32 2
+  %el3 = extractelement <vscale x 8 x i1> %a, i32 3
+  %v0 = insertelement <4 x i1> undef, i1 %el0, i32 0
+  %v1 = insertelement <4 x i1> %v0, i1 %el1, i32 1
+  %v2 = insertelement <4 x i1> %v1, i1 %el2, i32 2
+  %v3 = insertelement <4 x i1> %v2, i1 %el3, i32 3
+  ret <4 x i1> %v3
+}
+
+
+define <8 x i1> @reshuffle_v8i1_nxv8i1(<vscale x 8 x i1> %a) #0 {
+; CHECK-LABEL: reshuffle_v8i1_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
+; CHECK-NEXT:    umov w8, v1.h[1]
+; CHECK-NEXT:    umov w9, v1.h[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.b[1], w8
+; CHECK-NEXT:    umov w8, v1.h[3]
+; CHECK-NEXT:    mov v0.b[2], w9
+; CHECK-NEXT:    umov w9, v1.h[4]
+; CHECK-NEXT:    mov v0.b[3], w8
+; CHECK-NEXT:    umov w8, v1.h[5]
+; CHECK-NEXT:    mov v0.b[4], w9
+; CHECK-NEXT:    umov w9, v1.h[6]
+; CHECK-NEXT:    mov v0.b[5], w8
+; CHECK-NEXT:    umov w8, v1.h[7]
+; CHECK-NEXT:    mov v0.b[6], w9
+; CHECK-NEXT:    mov v0.b[7], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 8 x i1> %a, i32 0
+  %el1 = extractelement <vscale x 8 x i1> %a, i32 1
+  %el2 = extractelement <vscale x 8 x i1> %a, i32 2
+  %el3 = extractelement <vscale x 8 x i1> %a, i32 3
+  %el4 = extractelement <vscale x 8 x i1> %a, i32 4
+  %el5 = extractelement <vscale x 8 x i1> %a, i32 5
+  %el6 = extractelement <vscale x 8 x i1> %a, i32 6
+  %el7 = extractelement <vscale x 8 x i1> %a, i32 7
+  %v0 = insertelement <8 x i1> undef, i1 %el0, i32 0
+  %v1 = insertelement <8 x i1> %v0, i1 %el1, i32 1
+  %v2 = insertelement <8 x i1> %v1, i1 %el2, i32 2
+  %v3 = insertelement <8 x i1> %v2, i1 %el3, i32 3
+  %v4 = insertelement <8 x i1> %v3, i1 %el4, i32 4
+  %v5 = insertelement <8 x i1> %v4, i1 %el5, i32 5
+  %v6 = insertelement <8 x i1> %v5, i1 %el6, i32 6
+  %v7 = insertelement <8 x i1> %v6, i1 %el7, i32 7
+  ret <8 x i1> %v7
+}
+
+
+define <16 x i1> @reshuffle_v16i1_nxv16i1(<vscale x 16 x i1> %a) #0 {
+; CHECK-LABEL: reshuffle_v16i1_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    umov w8, v1.b[1]
+; CHECK-NEXT:    umov w9, v1.b[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.b[1], w8
+; CHECK-NEXT:    umov w8, v1.b[3]
+; CHECK-NEXT:    mov v0.b[2], w9
+; CHECK-NEXT:    umov w9, v1.b[4]
+; CHECK-NEXT:    mov v0.b[3], w8
+; CHECK-NEXT:    umov w8, v1.b[5]
+; CHECK-NEXT:    mov v0.b[4], w9
+; CHECK-NEXT:    umov w9, v1.b[6]
+; CHECK-NEXT:    mov v0.b[5], w8
+; CHECK-NEXT:    umov w8, v1.b[7]
+; CHECK-NEXT:    mov v0.b[6], w9
+; CHECK-NEXT:    umov w9, v1.b[8]
+; CHECK-NEXT:    mov v0.b[7], w8
+; CHECK-NEXT:    umov w8, v1.b[9]
+; CHECK-NEXT:    mov v0.b[8], w9
+; CHECK-NEXT:    umov w9, v1.b[10]
+; CHECK-NEXT:    mov v0.b[9], w8
+; CHECK-NEXT:    umov w8, v1.b[11]
+; CHECK-NEXT:    mov v0.b[10], w9
+; CHECK-NEXT:    umov w9, v1.b[12]
+; CHECK-NEXT:    mov v0.b[11], w8
+; CHECK-NEXT:    umov w8, v1.b[13]
+; CHECK-NEXT:    mov v0.b[12], w9
+; CHECK-NEXT:    umov w9, v1.b[14]
+; CHECK-NEXT:    mov v0.b[13], w8
+; CHECK-NEXT:    umov w8, v1.b[15]
+; CHECK-NEXT:    mov v0.b[14], w9
+; CHECK-NEXT:    mov v0.b[15], w8
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 16 x i1> %a, i32 0
+  %el1 = extractelement <vscale x 16 x i1> %a, i32 1
+  %el2 = extractelement <vscale x 16 x i1> %a, i32 2
+  %el3 = extractelement <vscale x 16 x i1> %a, i32 3
+  %el4 = extractelement <vscale x 16 x i1> %a, i32 4
+  %el5 = extractelement <vscale x 16 x i1> %a, i32 5
+  %el6 = extractelement <vscale x 16 x i1> %a, i32 6
+  %el7 = extractelement <vscale x 16 x i1> %a, i32 7
+  %el8 = extractelement <vscale x 16 x i1> %a, i32 8
+  %el9 = extractelement <vscale x 16 x i1> %a, i32 9
+  %el10 = extractelement <vscale x 16 x i1> %a, i32 10
+  %el11 = extractelement <vscale x 16 x i1> %a, i32 11
+  %el12 = extractelement <vscale x 16 x i1> %a, i32 12
+  %el13 = extractelement <vscale x 16 x i1> %a, i32 13
+  %el14 = extractelement <vscale x 16 x i1> %a, i32 14
+  %el15 = extractelement <vscale x 16 x i1> %a, i32 15
+  %v0 = insertelement <16 x i1> undef, i1 %el0, i32 0
+  %v1 = insertelement <16 x i1> %v0, i1 %el1, i32 1
+  %v2 = insertelement <16 x i1> %v1, i1 %el2, i32 2
+  %v3 = insertelement <16 x i1> %v2, i1 %el3, i32 3
+  %v4 = insertelement <16 x i1> %v3, i1 %el4, i32 4
+  %v5 = insertelement <16 x i1> %v4, i1 %el5, i32 5
+  %v6 = insertelement <16 x i1> %v5, i1 %el6, i32 6
+  %v7 = insertelement <16 x i1> %v6, i1 %el7, i32 7
+  %v8 = insertelement <16 x i1> %v7, i1 %el8, i32 8
+  %v9 = insertelement <16 x i1> %v8, i1 %el9, i32 9
+  %v10 = insertelement <16 x i1> %v9, i1 %el10, i32 10
+  %v11 = insertelement <16 x i1> %v10, i1 %el11, i32 11
+  %v12 = insertelement <16 x i1> %v11, i1 %el12, i32 12
+  %v13 = insertelement <16 x i1> %v12, i1 %el13, i32 13
+  %v14 = insertelement <16 x i1> %v13, i1 %el14, i32 14
+  %v15 = insertelement <16 x i1> %v14, i1 %el15, i32 15
+  ret <16 x i1> %v15
+}
+
+
+define <2 x i64> @reshuffle_v2i64_nxv2i64(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: reshuffle_v2i64_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 2 x i64> %a, i32 0
+  %el1 = extractelement <vscale x 2 x i64> %a, i32 1
+  %v0 = insertelement <2 x i64> undef, i64 %el0, i32 0
+  %v1 = insertelement <2 x i64> %v0, i64 %el1, i32 1
+  ret <2 x i64> %v1
+}
+
+
+define <2 x i32> @reshuffle_v2i32_nxv2i32(<vscale x 2 x i32> %a) #0 {
+; CHECK-LABEL: reshuffle_v2i32_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 2 x i32> %a, i32 0
+  %el1 = extractelement <vscale x 2 x i32> %a, i32 1
+  %v0 = insertelement <2 x i32> undef, i32 %el0, i32 0
+  %v1 = insertelement <2 x i32> %v0, i32 %el1, i32 1
+  ret <2 x i32> %v1
+}
+
+
+define <2 x i32> @reshuffle_v2i32_nxv4i32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: reshuffle_v2i32_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 4 x i32> %a, i32 0
+  %el1 = extractelement <vscale x 4 x i32> %a, i32 1
+  %v0 = insertelement <2 x i32> undef, i32 %el0, i32 0
+  %v1 = insertelement <2 x i32> %v0, i32 %el1, i32 1
+  ret <2 x i32> %v1
+}
+
+
+define <4 x i32> @reshuffle_v4i32_nxv4i32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: reshuffle_v4i32_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 4 x i32> %a, i32 0
+  %el1 = extractelement <vscale x 4 x i32> %a, i32 1
+  %el2 = extractelement <vscale x 4 x i32> %a, i32 2
+  %el3 = extractelement <vscale x 4 x i32> %a, i32 3
+  %v0 = insertelement <4 x i32> undef, i32 %el0, i32 0
+  %v1 = insertelement <4 x i32> %v0, i32 %el1, i32 1
+  %v2 = insertelement <4 x i32> %v1, i32 %el2, i32 2
+  %v3 = insertelement <4 x i32> %v2, i32 %el3, i32 3
+  ret <4 x i32> %v3
+}
+
+
+define <4 x i16> @reshuffle_v4i16_nxv4i16(<vscale x 4 x i16> %a) #0 {
+; CHECK-LABEL: reshuffle_v4i16_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 4 x i16> %a, i32 0
+  %el1 = extractelement <vscale x 4 x i16> %a, i32 1
+  %el2 = extractelement <vscale x 4 x i16> %a, i32 2
+  %el3 = extractelement <vscale x 4 x i16> %a, i32 3
+  %v0 = insertelement <4 x i16> undef, i16 %el0, i32 0
+  %v1 = insertelement <4 x i16> %v0, i16 %el1, i32 1
+  %v2 = insertelement <4 x i16> %v1, i16 %el2, i32 2
+  %v3 = insertelement <4 x i16> %v2, i16 %el3, i32 3
+  ret <4 x i16> %v3
+}
+
+
+define <4 x i16> @reshuffle_v4i16_nxv8i16(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: reshuffle_v4i16_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 8 x i16> %a, i32 0
+  %el1 = extractelement <vscale x 8 x i16> %a, i32 1
+  %el2 = extractelement <vscale x 8 x i16> %a, i32 2
+  %el3 = extractelement <vscale x 8 x i16> %a, i32 3
+  %v0 = insertelement <4 x i16> undef, i16 %el0, i32 0
+  %v1 = insertelement <4 x i16> %v0, i16 %el1, i32 1
+  %v2 = insertelement <4 x i16> %v1, i16 %el2, i32 2
+  %v3 = insertelement <4 x i16> %v2, i16 %el3, i32 3
+  ret <4 x i16> %v3
+}
+
+
+define <8 x i16> @reshuffle_v8i16_nxv8i16(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: reshuffle_v8i16_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 8 x i16> %a, i32 0
+  %el1 = extractelement <vscale x 8 x i16> %a, i32 1
+  %el2 = extractelement <vscale x 8 x i16> %a, i32 2
+  %el3 = extractelement <vscale x 8 x i16> %a, i32 3
+  %el4 = extractelement <vscale x 8 x i16> %a, i32 4
+  %el5 = extractelement <vscale x 8 x i16> %a, i32 5
+  %el6 = extractelement <vscale x 8 x i16> %a, i32 6
+  %el7 = extractelement <vscale x 8 x i16> %a, i32 7
+  %v0 = insertelement <8 x i16> undef, i16 %el0, i32 0
+  %v1 = insertelement <8 x i16> %v0, i16 %el1, i32 1
+  %v2 = insertelement <8 x i16> %v1, i16 %el2, i32 2
+  %v3 = insertelement <8 x i16> %v2, i16 %el3, i32 3
+  %v4 = insertelement <8 x i16> %v3, i16 %el4, i32 4
+  %v5 = insertelement <8 x i16> %v4, i16 %el5, i32 5
+  %v6 = insertelement <8 x i16> %v5, i16 %el6, i32 6
+  %v7 = insertelement <8 x i16> %v6, i16 %el7, i32 7
+  ret <8 x i16> %v7
+}
+
+
+define <16 x i8> @reshuffle_v16i8_nxv16i8(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: reshuffle_v16i8_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 16 x i8> %a, i32 0
+  %el1 = extractelement <vscale x 16 x i8> %a, i32 1
+  %el2 = extractelement <vscale x 16 x i8> %a, i32 2
+  %el3 = extractelement <vscale x 16 x i8> %a, i32 3
+  %el4 = extractelement <vscale x 16 x i8> %a, i32 4
+  %el5 = extractelement <vscale x 16 x i8> %a, i32 5
+  %el6 = extractelement <vscale x 16 x i8> %a, i32 6
+  %el7 = extractelement <vscale x 16 x i8> %a, i32 7
+  %el8 = extractelement <vscale x 16 x i8> %a, i32 8
+  %el9 = extractelement <vscale x 16 x i8> %a, i32 9
+  %el10 = extractelement <vscale x 16 x i8> %a, i32 10
+  %el11 = extractelement <vscale x 16 x i8> %a, i32 11
+  %el12 = extractelement <vscale x 16 x i8> %a, i32 12
+  %el13 = extractelement <vscale x 16 x i8> %a, i32 13
+  %el14 = extractelement <vscale x 16 x i8> %a, i32 14
+  %el15 = extractelement <vscale x 16 x i8> %a, i32 15
+  %v0 = insertelement <16 x i8> undef, i8 %el0, i32 0
+  %v1 = insertelement <16 x i8> %v0, i8 %el1, i32 1
+  %v2 = insertelement <16 x i8> %v1, i8 %el2, i32 2
+  %v3 = insertelement <16 x i8> %v2, i8 %el3, i32 3
+  %v4 = insertelement <16 x i8> %v3, i8 %el4, i32 4
+  %v5 = insertelement <16 x i8> %v4, i8 %el5, i32 5
+  %v6 = insertelement <16 x i8> %v5, i8 %el6, i32 6
+  %v7 = insertelement <16 x i8> %v6, i8 %el7, i32 7
+  %v8 = insertelement <16 x i8> %v7, i8 %el8, i32 8
+  %v9 = insertelement <16 x i8> %v8, i8 %el9, i32 9
+  %v10 = insertelement <16 x i8> %v9, i8 %el10, i32 10
+  %v11 = insertelement <16 x i8> %v10, i8 %el11, i32 11
+  %v12 = insertelement <16 x i8> %v11, i8 %el12, i32 12
+  %v13 = insertelement <16 x i8> %v12, i8 %el13, i32 13
+  %v14 = insertelement <16 x i8> %v13, i8 %el14, i32 14
+  %v15 = insertelement <16 x i8> %v14, i8 %el15, i32 15
+  ret <16 x i8> %v15
+}
+
+
+define <8 x i8> @reshuffle_v8i8_nxv16i8(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: reshuffle_v8i8_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 16 x i8> %a, i32 0
+  %el1 = extractelement <vscale x 16 x i8> %a, i32 1
+  %el2 = extractelement <vscale x 16 x i8> %a, i32 2
+  %el3 = extractelement <vscale x 16 x i8> %a, i32 3
+  %el4 = extractelement <vscale x 16 x i8> %a, i32 4
+  %el5 = extractelement <vscale x 16 x i8> %a, i32 5
+  %el6 = extractelement <vscale x 16 x i8> %a, i32 6
+  %el7 = extractelement <vscale x 16 x i8> %a, i32 7
+  %v0 = insertelement <8 x i8> undef, i8 %el0, i32 0
+  %v1 = insertelement <8 x i8> %v0, i8 %el1, i32 1
+  %v2 = insertelement <8 x i8> %v1, i8 %el2, i32 2
+  %v3 = insertelement <8 x i8> %v2, i8 %el3, i32 3
+  %v4 = insertelement <8 x i8> %v3, i8 %el4, i32 4
+  %v5 = insertelement <8 x i8> %v4, i8 %el5, i32 5
+  %v6 = insertelement <8 x i8> %v5, i8 %el6, i32 6
+  %v7 = insertelement <8 x i8> %v6, i8 %el7, i32 7
+  ret <8 x i8> %v7
+}
+
+
+define <2 x float> @reshuffle_v2f32_nxv2f32(<vscale x 2 x float> %a) #0 {
+; CHECK-LABEL: reshuffle_v2f32_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    ldr d0, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 2 x float> %a, i32 0
+  %el1 = extractelement <vscale x 2 x float> %a, i32 1
+  %v0 = insertelement <2 x float> undef, float %el0, i32 0
+  %v1 = insertelement <2 x float> %v0, float %el1, i32 1
+  ret <2 x float> %v1
+}
+
+
+define <4 x float> @reshuffle_v4f32_nxv4f32(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: reshuffle_v4f32_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 4 x float> %a, i32 0
+  %el1 = extractelement <vscale x 4 x float> %a, i32 1
+  %el2 = extractelement <vscale x 4 x float> %a, i32 2
+  %el3 = extractelement <vscale x 4 x float> %a, i32 3
+  %v0 = insertelement <4 x float> undef, float %el0, i32 0
+  %v1 = insertelement <4 x float> %v0, float %el1, i32 1
+  %v2 = insertelement <4 x float> %v1, float %el2, i32 2
+  %v3 = insertelement <4 x float> %v2, float %el3, i32 3
+  ret <4 x float> %v3
+}
+
+
+; == Reversed first N elements ==
+
+define <4 x i16> @reshuffle_v4i16_nxv4i16_reverse(<vscale x 4 x i16> %a) #0 {
+; CHECK-LABEL: reshuffle_v4i16_nxv4i16_reverse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[3]
+; CHECK-NEXT:    mov w9, v0.s[2]
+; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    mov v1.h[1], w9
+; CHECK-NEXT:    mov v1.h[2], w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov v1.h[3], w8
+; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 4 x i16> %a, i32 0
+  %el1 = extractelement <vscale x 4 x i16> %a, i32 1
+  %el2 = extractelement <vscale x 4 x i16> %a, i32 2
+  %el3 = extractelement <vscale x 4 x i16> %a, i32 3
+  %v0 = insertelement <4 x i16> undef, i16 %el3, i32 0
+  %v1 = insertelement <4 x i16> %v0, i16 %el2, i32 1
+  %v2 = insertelement <4 x i16> %v1, i16 %el1, i32 2
+  %v3 = insertelement <4 x i16> %v2, i16 %el0, i32 3
+  ret <4 x i16> %v3
+}
+
+
+define <8 x i16> @reshuffle_v8i16_nxv8i16_reverse(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: reshuffle_v8i16_nxv8i16_reverse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.h[7]
+; CHECK-NEXT:    umov w9, v0.h[6]
+; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    umov w8, v0.h[5]
+; CHECK-NEXT:    mov v1.h[1], w9
+; CHECK-NEXT:    umov w9, v0.h[4]
+; CHECK-NEXT:    mov v1.h[2], w8
+; CHECK-NEXT:    umov w8, v0.h[3]
+; CHECK-NEXT:    mov v1.h[3], w9
+; CHECK-NEXT:    umov w9, v0.h[2]
+; CHECK-NEXT:    mov v1.h[4], w8
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    mov v1.h[5], w9
+; CHECK-NEXT:    mov v1.h[6], w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov v1.h[7], w8
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 8 x i16> %a, i32 0
+  %el1 = extractelement <vscale x 8 x i16> %a, i32 1
+  %el2 = extractelement <vscale x 8 x i16> %a, i32 2
+  %el3 = extractelement <vscale x 8 x i16> %a, i32 3
+  %el4 = extractelement <vscale x 8 x i16> %a, i32 4
+  %el5 = extractelement <vscale x 8 x i16> %a, i32 5
+  %el6 = extractelement <vscale x 8 x i16> %a, i32 6
+  %el7 = extractelement <vscale x 8 x i16> %a, i32 7
+  %v0 = insertelement <8 x i16> undef, i16 %el7, i32 0
+  %v1 = insertelement <8 x i16> %v0, i16 %el6, i32 1
+  %v2 = insertelement <8 x i16> %v1, i16 %el5, i32 2
+  %v3 = insertelement <8 x i16> %v2, i16 %el4, i32 3
+  %v4 = insertelement <8 x i16> %v3, i16 %el3, i32 4
+  %v5 = insertelement <8 x i16> %v4, i16 %el2, i32 5
+  %v6 = insertelement <8 x i16> %v5, i16 %el1, i32 6
+  %v7 = insertelement <8 x i16> %v6, i16 %el0, i32 7
+  ret <8 x i16> %v7
+}
+
+
+; == Result requires padding one source with undef elements ==
+
+define <2 x i32> @reshuffle_v2i32_nxv2i32_undef(<vscale x 2 x i32> %a) #0 {
+; CHECK-LABEL: reshuffle_v2i32_nxv2i32_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 2 x i32> %a, i32 0
+  %v0 = insertelement <2 x i32> undef, i32 %el0, i32 0
+  ret <2 x i32> %v0
+}
+
+
+define <2 x i32> @reshuffle_v2i32_nxv2i32_undef2(<vscale x 2 x i32> %a) #0 {
+; CHECK-LABEL: reshuffle_v2i32_nxv2i32_undef2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[2]
+; CHECK-NEXT:    dup v0.2s, w8
+; CHECK-NEXT:    ret
+  %el1 = extractelement <vscale x 2 x i32> %a, i32 1
+  %v0 = insertelement <2 x i32> undef, i32 %el1, i32 1
+  ret <2 x i32> %v0
+}
+
+
+define <2 x i32> @reshuffle_v2i32_nxv4i32_undef(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: reshuffle_v2i32_nxv4i32_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 4 x i32> %a, i32 0
+  %v0 = insertelement <2 x i32> undef, i32 %el0, i32 0
+  ret <2 x i32> %v0
+}
+
+
+define <2 x i32> @reshuffle_v2i32_nxv4i32_undef2(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: reshuffle_v2i32_nxv4i32_undef2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    dup v0.2s, w8
+; CHECK-NEXT:    ret
+  %el1 = extractelement <vscale x 4 x i32> %a, i32 1
+  %v0 = insertelement <2 x i32> undef, i32 %el1, i32 1
+  ret <2 x i32> %v0
+}
+
+
+define <4 x i32> @reshuffle_v4i32_nxv4i32_undef(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: reshuffle_v4i32_nxv4i32_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 4 x i32> %a, i32 0
+  %el1 = extractelement <vscale x 4 x i32> %a, i32 1
+  %v0 = insertelement <4 x i32> undef, i32 %el0, i32 0
+  %v1 = insertelement <4 x i32> %v0, i32 %el1, i32 1
+  ret <4 x i32> %v1
+}
+
+
+define <4 x i32> @reshuffle_v4i32_nxv4i32_undef2(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: reshuffle_v4i32_nxv4i32_undef2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[2]
+; CHECK-NEXT:    mov w9, v0.s[3]
+; CHECK-NEXT:    mov v0.s[2], w8
+; CHECK-NEXT:    mov v0.s[3], w9
+; CHECK-NEXT:    ret
+  %el2 = extractelement <vscale x 4 x i32> %a, i32 2
+  %el3 = extractelement <vscale x 4 x i32> %a, i32 3
+  %v0 = insertelement <4 x i32> undef, i32 %el2, i32 2
+  %v1 = insertelement <4 x i32> %v0, i32 %el3, i32 3
+  ret <4 x i32> %v1
+}
+
+
+define <2 x i16> @reshuffle_v2i16_nxv8i16_undef(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: reshuffle_v2i16_nxv8i16_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 8 x i16> %a, i32 0
+  %v0 = insertelement <2 x i16> undef, i16 %el0, i32 0
+  ret <2 x i16> %v0
+}
+
+
+define <2 x i16> @reshuffle_v2i16_nxv8i16_undef2(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: reshuffle_v2i16_nxv8i16_undef2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    dup v0.2s, w8
+; CHECK-NEXT:    ret
+  %el1 = extractelement <vscale x 8 x i16> %a, i32 1
+  %v0 = insertelement <2 x i16> undef, i16 %el1, i32 1
+  ret <2 x i16> %v0
+}
+
+
+define <4 x i16> @reshuffle_v4i16_nxv8i16_undef(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: reshuffle_v4i16_nxv8i16_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 8 x i16> %a, i32 0
+  %el1 = extractelement <vscale x 8 x i16> %a, i32 1
+  %v0 = insertelement <4 x i16> undef, i16 %el0, i32 0
+  %v1 = insertelement <4 x i16> %v0, i16 %el1, i32 1
+  ret <4 x i16> %v1
+}
+
+define <4 x i16> @reshuffle_v4i16_nxv8i16_undef2(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: reshuffle_v4i16_nxv8i16_undef2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.h[2]
+; CHECK-NEXT:    umov w9, v0.h[3]
+; CHECK-NEXT:    mov v0.h[2], w8
+; CHECK-NEXT:    mov v0.h[3], w9
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %el2 = extractelement <vscale x 8 x i16> %a, i32 2
+  %el3 = extractelement <vscale x 8 x i16> %a, i32 3
+  %v0 = insertelement <4 x i16> undef, i16 %el2, i32 2
+  %v1 = insertelement <4 x i16> %v0, i16 %el3, i32 3
+  ret <4 x i16> %v1
+}
+
+
+define <8 x i16> @reshuffle_v8i16_nxv8i16_undef(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: reshuffle_v8i16_nxv8i16_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    umov w9, v0.h[2]
+; CHECK-NEXT:    umov w10, v0.h[3]
+; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    mov v0.h[2], w9
+; CHECK-NEXT:    mov v0.h[3], w10
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 8 x i16> %a, i32 0
+  %el1 = extractelement <vscale x 8 x i16> %a, i32 1
+  %el2 = extractelement <vscale x 8 x i16> %a, i32 2
+  %el3 = extractelement <vscale x 8 x i16> %a, i32 3
+  %v0 = insertelement <8 x i16> undef, i16 %el0, i32 0
+  %v1 = insertelement <8 x i16> %v0, i16 %el1, i32 1
+  %v2 = insertelement <8 x i16> %v1, i16 %el2, i32 2
+  %v3 = insertelement <8 x i16> %v2, i16 %el3, i32 3
+  ret <8 x i16> %v3
+}
+
+
+define <8 x i16> @reshuffle_v8i16_nxv8i16_undef2(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: reshuffle_v8i16_nxv8i16_undef2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.h[4]
+; CHECK-NEXT:    umov w9, v0.h[5]
+; CHECK-NEXT:    mov v1.h[4], w8
+; CHECK-NEXT:    umov w8, v0.h[6]
+; CHECK-NEXT:    mov v1.h[5], w9
+; CHECK-NEXT:    umov w9, v0.h[7]
+; CHECK-NEXT:    mov v1.h[6], w8
+; CHECK-NEXT:    mov v1.h[7], w9
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %el4 = extractelement <vscale x 8 x i16> %a, i32 4
+  %el5 = extractelement <vscale x 8 x i16> %a, i32 5
+  %el6 = extractelement <vscale x 8 x i16> %a, i32 6
+  %el7 = extractelement <vscale x 8 x i16> %a, i32 7
+  %v0 = insertelement <8 x i16> undef, i16 %el4, i32 4
+  %v1 = insertelement <8 x i16> %v0, i16 %el5, i32 5
+  %v2 = insertelement <8 x i16> %v1, i16 %el6, i32 6
+  %v3 = insertelement <8 x i16> %v2, i16 %el7, i32 7
+  ret <8 x i16> %v3
+}
+
+
+define <4 x i32> @reshuffle_v4i32_nxv4i32_reverse_undef(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: reshuffle_v4i32_nxv4i32_reverse_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    mov v0.s[1], w9
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 4 x i32> %a, i32 0
+  %el1 = extractelement <vscale x 4 x i32> %a, i32 1
+  %v0 = insertelement <4 x i32> undef, i32 %el1, i32 0
+  %v1 = insertelement <4 x i32> %v0, i32 %el0, i32 1
+  ret <4 x i32> %v1
+}
+
+
+define <4 x i32> @reshuffle_v4i32_nxv4i32_reverse_undef2(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: reshuffle_v4i32_nxv4i32_reverse_undef2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[3]
+; CHECK-NEXT:    mov w9, v0.s[2]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    mov v0.s[1], w9
+; CHECK-NEXT:    ret
+  %el2 = extractelement <vscale x 4 x i32> %a, i32 2
+  %el3 = extractelement <vscale x 4 x i32> %a, i32 3
+  %v0 = insertelement <4 x i32> undef, i32 %el3, i32 0
+  %v1 = insertelement <4 x i32> %v0, i32 %el2, i32 1
+  ret <4 x i32> %v1
+}
+
+
+define <8 x i16> @reshuffle_v8i16_nxv8i16_reverse_undef(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: reshuffle_v8i16_nxv8i16_reverse_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.h[3]
+; CHECK-NEXT:    umov w9, v0.h[2]
+; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    mov v1.h[1], w9
+; CHECK-NEXT:    mov v1.h[2], w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov v1.h[3], w8
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 8 x i16> %a, i32 0
+  %el1 = extractelement <vscale x 8 x i16> %a, i32 1
+  %el2 = extractelement <vscale x 8 x i16> %a, i32 2
+  %el3 = extractelement <vscale x 8 x i16> %a, i32 3
+  %v0 = insertelement <8 x i16> undef, i16 %el3, i32 0
+  %v1 = insertelement <8 x i16> %v0, i16 %el2, i32 1
+  %v2 = insertelement <8 x i16> %v1, i16 %el1, i32 2
+  %v3 = insertelement <8 x i16> %v2, i16 %el0, i32 3
+  ret <8 x i16> %v3
+}
+
+
+define <8 x i16> @reshuffle_v8i16_nxv8i16_reverse_undef2(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: reshuffle_v8i16_nxv8i16_reverse_undef2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.h[7]
+; CHECK-NEXT:    umov w9, v0.h[6]
+; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    umov w8, v0.h[5]
+; CHECK-NEXT:    mov v1.h[1], w9
+; CHECK-NEXT:    umov w9, v0.h[4]
+; CHECK-NEXT:    mov v1.h[2], w8
+; CHECK-NEXT:    mov v1.h[3], w9
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %el4 = extractelement <vscale x 8 x i16> %a, i32 4
+  %el5 = extractelement <vscale x 8 x i16> %a, i32 5
+  %el6 = extractelement <vscale x 8 x i16> %a, i32 6
+  %el7 = extractelement <vscale x 8 x i16> %a, i32 7
+  %v0 = insertelement <8 x i16> undef, i16 %el7, i32 0
+  %v1 = insertelement <8 x i16> %v0, i16 %el6, i32 1
+  %v2 = insertelement <8 x i16> %v1, i16 %el5, i32 2
+  %v3 = insertelement <8 x i16> %v2, i16 %el4, i32 3
+  ret <8 x i16> %v3
+}
+
+
+; == Shuffle comes from two input sources ==
+
+define <2 x i64> @reshuffle_v2i64_2x_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: reshuffle_v2i64_2x_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 2 x i64> %a, i32 0
+  %el1 = extractelement <vscale x 2 x i64> %b, i32 0
+  %v0 = insertelement <2 x i64> undef, i64 %el0, i32 0
+  %v1 = insertelement <2 x i64> %v0, i64 %el1, i32 1
+  ret <2 x i64> %v1
+}
+
+
+define <2 x i32> @reshuffle_v2i32_2x_nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b) #0 {
+; CHECK-LABEL: reshuffle_v2i32_2x_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 2 x i32> %a, i32 0
+  %el1 = extractelement <vscale x 2 x i32> %b, i32 0
+  %v0 = insertelement <2 x i32> undef, i32 %el0, i32 0
+  %v1 = insertelement <2 x i32> %v0, i32 %el1, i32 1
+  ret <2 x i32> %v1
+}
+
+
+define <4 x i32> @reshuffle_v4i32_2x_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: reshuffle_v4i32_2x_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    mov w9, v1.s[1]
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov v0.s[2], w8
+; CHECK-NEXT:    mov v0.s[3], w9
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 4 x i32> %a, i32 0
+  %el1 = extractelement <vscale x 4 x i32> %a, i32 1
+  %el2 = extractelement <vscale x 4 x i32> %b, i32 0
+  %el3 = extractelement <vscale x 4 x i32> %b, i32 1
+  %v0 = insertelement <4 x i32> undef, i32 %el0, i32 0
+  %v1 = insertelement <4 x i32> %v0, i32 %el1, i32 1
+  %v2 = insertelement <4 x i32> %v1, i32 %el2, i32 2
+  %v3 = insertelement <4 x i32> %v2, i32 %el3, i32 3
+  ret <4 x i32> %v3
+}
+
+
+define <4 x i16> @reshuffle_v4i16_2x_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: reshuffle_v4i16_2x_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    umov w9, v1.h[1]
+; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov v0.h[2], w8
+; CHECK-NEXT:    mov v0.h[3], w9
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 8 x i16> %a, i32 0
+  %el1 = extractelement <vscale x 8 x i16> %a, i32 1
+  %el2 = extractelement <vscale x 8 x i16> %b, i32 0
+  %el3 = extractelement <vscale x 8 x i16> %b, i32 1
+  %v0 = insertelement <4 x i16> undef, i16 %el0, i32 0
+  %v1 = insertelement <4 x i16> %v0, i16 %el1, i32 1
+  %v2 = insertelement <4 x i16> %v1, i16 %el2, i32 2
+  %v3 = insertelement <4 x i16> %v2, i16 %el3, i32 3
+  ret <4 x i16> %v3
+}
+
+
+define <8 x i16> @reshuffle_v8i16_2x_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: reshuffle_v8i16_2x_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    umov w9, v0.h[2]
+; CHECK-NEXT:    umov w10, v0.h[3]
+; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov v0.h[2], w9
+; CHECK-NEXT:    umov w9, v1.h[1]
+; CHECK-NEXT:    mov v0.h[3], w10
+; CHECK-NEXT:    mov v0.h[4], w8
+; CHECK-NEXT:    umov w8, v1.h[2]
+; CHECK-NEXT:    mov v0.h[5], w9
+; CHECK-NEXT:    umov w9, v1.h[3]
+; CHECK-NEXT:    mov v0.h[6], w8
+; CHECK-NEXT:    mov v0.h[7], w9
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 8 x i16> %a, i32 0
+  %el1 = extractelement <vscale x 8 x i16> %a, i32 1
+  %el2 = extractelement <vscale x 8 x i16> %a, i32 2
+  %el3 = extractelement <vscale x 8 x i16> %a, i32 3
+  %el4 = extractelement <vscale x 8 x i16> %b, i32 0
+  %el5 = extractelement <vscale x 8 x i16> %b, i32 1
+  %el6 = extractelement <vscale x 8 x i16> %b, i32 2
+  %el7 = extractelement <vscale x 8 x i16> %b, i32 3
+  %v0 = insertelement <8 x i16> undef, i16 %el0, i32 0
+  %v1 = insertelement <8 x i16> %v0, i16 %el1, i32 1
+  %v2 = insertelement <8 x i16> %v1, i16 %el2, i32 2
+  %v3 = insertelement <8 x i16> %v2, i16 %el3, i32 3
+  %v4 = insertelement <8 x i16> %v3, i16 %el4, i32 4
+  %v5 = insertelement <8 x i16> %v4, i16 %el5, i32 5
+  %v6 = insertelement <8 x i16> %v5, i16 %el6, i32 6
+  %v7 = insertelement <8 x i16> %v6, i16 %el7, i32 7
+  ret <8 x i16> %v7
+}
+
+
+; == Shuffle comes from two input sources and result requires padding with undef ==
+
+
+define <4 x i32> @reshuffle_v4i32_2x_nxv4i32_undef1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: reshuffle_v4i32_2x_nxv4i32_undef1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 4 x i32> %a, i32 0
+  %el1 = extractelement <vscale x 4 x i32> %b, i32 0
+  %v0 = insertelement <4 x i32> undef, i32 %el0, i32 0
+  %v1 = insertelement <4 x i32> %v0, i32 %el1, i32 1
+  ret <4 x i32> %v1
+}
+
+
+define <8 x i16> @reshuffle_v8i16_2x_nxv8i16_undef1(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: reshuffle_v8i16_2x_nxv8i16_undef1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 8 x i16> %a, i32 0
+  %el1 = extractelement <vscale x 8 x i16> %b, i32 0
+  %v0 = insertelement <8 x i16> undef, i16 %el0, i32 0
+  %v1 = insertelement <8 x i16> %v0, i16 %el1, i32 1
+  ret <8 x i16> %v1
+}
+
+
+define <8 x i16> @reshuffle_v8i16_2x_nxv8i16_undef2(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: reshuffle_v8i16_2x_nxv8i16_undef2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    umov w9, v1.h[1]
+; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov v0.h[2], w8
+; CHECK-NEXT:    mov v0.h[3], w9
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 8 x i16> %a, i32 0
+  %el1 = extractelement <vscale x 8 x i16> %a, i32 1
+  %el2 = extractelement <vscale x 8 x i16> %b, i32 0
+  %el3 = extractelement <vscale x 8 x i16> %b, i32 1
+  %v0 = insertelement <8 x i16> undef, i16 %el0, i32 0
+  %v1 = insertelement <8 x i16> %v0, i16 %el1, i32 1
+  %v2 = insertelement <8 x i16> %v1, i16 %el2, i32 2
+  %v3 = insertelement <8 x i16> %v2, i16 %el3, i32 3
+  ret <8 x i16> %v3
+}
+
+
 attributes #0 = { "target-features"="+sve" }