diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll @@ -0,0 +1,240 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; REVB pattern for shuffle v32i8 -> v16i16 +define void @test_revbv16i16(ptr %a) #0 { +; CHECK-LABEL: test_revbv16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: revb z0.h, p0/m, z0.h +; CHECK-NEXT: revb z1.h, p0/m, z1.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %tmp1 = load <32 x i8>, ptr %a + %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> + store <32 x i8> %tmp2, ptr %a + ret void +} + +; REVB pattern for shuffle v32i8 -> v8i32 +define void @test_revbv8i32(ptr %a) #0 { +; CHECK-LABEL: test_revbv8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: revb z0.s, p0/m, z0.s +; CHECK-NEXT: revb z1.s, p0/m, z1.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %tmp1 = load <32 x i8>, ptr %a + %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> + store <32 x i8> %tmp2, ptr %a + ret void +} + +; REVB pattern for shuffle v32i8 -> v4i64 +define void @test_revbv4i64(ptr %a) #0 { +; CHECK-LABEL: test_revbv4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: revb z0.d, p0/m, z0.d +; CHECK-NEXT: revb z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %tmp1 = load <32 x i8>, ptr %a + %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> + store <32 x i8> %tmp2, ptr %a + ret void +} + +; REVH pattern for shuffle v16i16 -> v8i32 +define void @test_revhv8i32(ptr %a) #0 { +; CHECK-LABEL: test_revhv8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: revh z0.s, p0/m, z0.s +; CHECK-NEXT: revh z1.s, p0/m, z1.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %tmp1 = load <16 x i16>, ptr %a + %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> + store <16 x i16> %tmp2, ptr %a + ret void +} + +; REVH pattern for shuffle v16f16 -> v8f32 +define void @test_revhv8f32(ptr %a) #0 { +; CHECK-LABEL: test_revhv8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: revh z0.s, p0/m, z0.s +; CHECK-NEXT: revh z1.s, p0/m, z1.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %tmp1 = load <16 x half>, ptr %a + %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> + store <16 x half> %tmp2, ptr %a + ret void +} + +; REVH pattern for shuffle v16i16 -> v4i64 +define void @test_revhv4i64(ptr %a) #0 { +; CHECK-LABEL: test_revhv4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: revh z0.d, p0/m, z0.d +; CHECK-NEXT: revh z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %tmp1 = load <16 x i16>, ptr %a + %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> + store <16 x i16> %tmp2, ptr %a + ret void +} + +; REVW pattern for shuffle v8i32 -> v4i64 +define void @test_revwv4i64(ptr %a) #0 { +; CHECK-LABEL: test_revwv4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: revw z0.d, p0/m, 
z0.d +; CHECK-NEXT: revw z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %tmp1 = load <8 x i32>, ptr %a + %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> + store <8 x i32> %tmp2, ptr %a + ret void +} + +; REVW pattern for shuffle v8f32 -> v4f64 +define void @test_revwv4f64(ptr %a) #0 { +; CHECK-LABEL: test_revwv4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: revw z0.d, p0/m, z0.d +; CHECK-NEXT: revw z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %tmp1 = load <8 x float>, ptr %a + %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> + store <8 x float> %tmp2, ptr %a + ret void +} + +define <16 x i8> @test_revv16i8(ptr %a) #0 { +; CHECK-LABEL: test_revv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: revb z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %tmp1 = load <16 x i8>, ptr %a + %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> + ret <16 x i8> %tmp2 +} + +; REVW pattern for shuffle two v8i32 inputs with the second input available. +define void @test_revwv8i32v8i32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: test_revwv8i32v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: revw z0.d, p0/m, z0.d +; CHECK-NEXT: revw z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %tmp1 = load <8 x i32>, ptr %a + %tmp2 = load <8 x i32>, ptr %b + %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> + store <8 x i32> %tmp3, ptr %a + ret void +} + +define void @test_revhv32i16(ptr %a) #0 { +; CHECK-LABEL: test_revhv32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: revh z0.d, p0/m, z0.d +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: revh z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x0, #32] +; CHECK-NEXT: revh z0.d, p0/m, z2.d +; CHECK-NEXT: revh z1.d, p0/m, z3.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %tmp1 = load <32 x i16>, ptr %a + %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> + store <32 x i16> %tmp2, ptr %a + ret void +} + +define void @test_rev_elts_fail(ptr %a) #0 { +; CHECK-LABEL: test_rev_elts_fail: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: mov z2.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: stp x9, x8, [sp, #-32]! 
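+; (No single streaming-compatible REV instruction swaps the 64-bit elements within each 128-bit pair, hence the expected round trip through the stack here.)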
+; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: stp x11, x10, [sp, #16] +; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret + %tmp1 = load <4 x i64>, ptr %a + %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> + store <4 x i64> %tmp2, ptr %a + ret void +} + +define void @test_revv8i32(ptr %a) #0 { +; CHECK-LABEL: test_revv8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: mov z3.s, z0.s[2] +; CHECK-NEXT: mov z4.s, z0.s[3] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: mov z0.s, z1.s[1] +; CHECK-NEXT: mov z2.s, z1.s[2] +; CHECK-NEXT: mov z3.s, z1.s[3] +; CHECK-NEXT: stp w9, w8, [sp, #24] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: stp w11, w10, [sp, #16] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: stp w9, w8, [sp, #8] +; CHECK-NEXT: stp w11, w10, [sp] +; CHECK-NEXT: ldp q0, q1, [sp] +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret + %tmp1 = load <8 x i32>, ptr %a + %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> + store <8 x i32> %tmp2, ptr %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -0,0 +1,1271 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define void @zip1_v32i8(ptr %a, ptr %b) #0 { +; CHECK-LABEL: zip1_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: mov z2.b, z0.b[15] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.b, z0.b[14] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.b, z0.b[13] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.b, z0.b[12] +; CHECK-NEXT: strb w8, [sp, #14] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.b, z0.b[11] +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.b, z0.b[10] +; CHECK-NEXT: strb w10, [sp, #10] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.b, z0.b[9] +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.b, z0.b[8] +; CHECK-NEXT: strb w9, [sp, #6] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.b, z1.b[15] +; CHECK-NEXT: strb w10, [sp, #4] +; CHECK-NEXT: strb w8, [sp, #2] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.b, z1.b[14] +; CHECK-NEXT: strb w9, [sp] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.b, z1.b[13] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.b, z1.b[12] +; CHECK-NEXT: strb w8, [sp, #15] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.b, z1.b[11] +; CHECK-NEXT: strb w9, [sp, #13] +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: zip1 z0.b, z0.b, z1.b +; CHECK-NEXT: strb w8, [sp, #9] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.b, z1.b[10] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.b, z1.b[9] +; CHECK-NEXT: fmov w10, s2 
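+; (Only the low half of the interleave maps onto a single ZIP1; the upper result lanes are presumably assembled byte-by-byte through the stack, as in the surrounding stores.)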
+; CHECK-NEXT: mov z2.b, z1.b[8] +; CHECK-NEXT: strb w8, [sp, #7] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strb w9, [sp, #5] +; CHECK-NEXT: strb w10, [sp, #3] +; CHECK-NEXT: strb w8, [sp, #1] +; CHECK-NEXT: ldr q2, [sp] +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: str q2, [x0, #16] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %tmp1 = load volatile <32 x i8>, ptr %a + %tmp2 = load volatile <32 x i8>, ptr %b + %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> + store volatile <32 x i8> %tmp3, ptr %a + ret void +} + +define void @zip_v32i16(ptr %a, ptr %b) #0 { +; CHECK-LABEL: zip_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: ldp q2, q5, [x1] +; CHECK-NEXT: ldp q4, q7, [x0] +; CHECK-NEXT: mov z16.h, z5.h[7] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z16.h, z5.h[6] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: mov z16.h, z5.h[5] +; CHECK-NEXT: mov z17.h, z7.h[7] +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: mov z17.h, z7.h[6] +; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q3, q6, [x1, #32] +; CHECK-NEXT: strh w8, [sp, #30] +; CHECK-NEXT: fmov w8, s17 +; CHECK-NEXT: strh w9, [sp, #28] +; CHECK-NEXT: strh w10, [sp, #26] +; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z16.h, z7.h[5] +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: mov z16.h, z5.h[4] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: mov z16.h, z7.h[4] +; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z16.h, z2.h[7] +; CHECK-NEXT: strh w9, [sp, #20] +; CHECK-NEXT: strh w10, [sp, #18] +; CHECK-NEXT: mov z18.h, z6.h[7] +; CHECK-NEXT: strh w8, [sp, #16] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z16.h, z4.h[7] +; CHECK-NEXT: ldr q17, [sp, #16] +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: mov z16.h, z2.h[6] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: mov z16.h, z4.h[6] +; CHECK-NEXT: strh w8, [sp, #62] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z16.h, z2.h[5] +; CHECK-NEXT: strh w9, [sp, #60] +; CHECK-NEXT: strh w10, [sp, #58] +; CHECK-NEXT: zip1 z5.h, z7.h, z5.h +; CHECK-NEXT: strh w8, [sp, #56] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z16.h, z4.h[5] +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: mov z16.h, z2.h[4] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: mov z16.h, z4.h[4] +; CHECK-NEXT: strh w8, [sp, #54] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: strh w9, [sp, #52] +; CHECK-NEXT: zip1 z2.h, z4.h, z2.h +; CHECK-NEXT: strh w10, [sp, #50] +; CHECK-NEXT: strh w8, [sp, #48] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z1.h[7] +; CHECK-NEXT: ldr q16, [sp, #48] +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: mov z18.h, z6.h[6] +; CHECK-NEXT: fmov w10, s18 +; CHECK-NEXT: mov z18.h, z1.h[6] +; CHECK-NEXT: strh w8, [sp, #46] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z6.h[5] +; CHECK-NEXT: strh w9, [sp, #44] +; CHECK-NEXT: strh w10, [sp, #42] +; CHECK-NEXT: strh w8, [sp, #40] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z1.h[5] +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: mov z18.h, z6.h[4] +; CHECK-NEXT: fmov w10, s18 +; CHECK-NEXT: mov z18.h, z1.h[4] +; CHECK-NEXT: strh w8, [sp, #38] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z3.h[7] +; CHECK-NEXT: strh w9, [sp, #36] +; CHECK-NEXT: strh w10, [sp, #34] +; CHECK-NEXT: zip1 z1.h, z1.h, z6.h +; CHECK-NEXT: strh w8, [sp, #32] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z0.h[7] +; CHECK-NEXT: ldr q4, [sp, #32] +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: mov z18.h, z3.h[6] +; CHECK-NEXT: fmov w10, 
s18 +; CHECK-NEXT: mov z18.h, z0.h[6] +; CHECK-NEXT: strh w8, [sp, #14] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z3.h[5] +; CHECK-NEXT: strh w9, [sp, #12] +; CHECK-NEXT: strh w10, [sp, #10] +; CHECK-NEXT: add z1.h, z5.h, z1.h +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z0.h[5] +; CHECK-NEXT: add z4.h, z17.h, z4.h +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: mov z18.h, z3.h[4] +; CHECK-NEXT: fmov w10, s18 +; CHECK-NEXT: mov z18.h, z0.h[4] +; CHECK-NEXT: strh w8, [sp, #6] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: strh w9, [sp, #4] +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: strh w10, [sp, #2] +; CHECK-NEXT: add z0.h, z2.h, z0.h +; CHECK-NEXT: strh w8, [sp] +; CHECK-NEXT: ldr q3, [sp] +; CHECK-NEXT: stp q1, q4, [x0, #32] +; CHECK-NEXT: add z1.h, z16.h, z3.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: ret + %tmp1 = load <32 x i16>, ptr %a + %tmp2 = load <32 x i16>, ptr %b + %tmp3 = shufflevector <32 x i16> %tmp1, <32 x i16> %tmp2, <32 x i32> + %tmp4 = shufflevector <32 x i16> %tmp1, <32 x i16> %tmp2, <32 x i32> + %tmp5 = add <32 x i16> %tmp3, %tmp4 + store <32 x i16> %tmp5, ptr %a + ret void +} + +define void @zip1_v16i16(ptr %a, ptr %b) #0 { +; CHECK-LABEL: zip1_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: mov z2.h, z0.h[7] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.h, z0.h[5] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.h, z0.h[4] +; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.h, z1.h[7] +; CHECK-NEXT: strh w9, [sp, #8] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.h, z1.h[6] +; CHECK-NEXT: strh w10, [sp, #4] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.h, z1.h[5] +; CHECK-NEXT: strh w8, [sp] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.h, z1.h[4] +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strh w10, [sp, #10] +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h +; CHECK-NEXT: strh w8, [sp, #6] +; CHECK-NEXT: strh w9, [sp, #2] +; CHECK-NEXT: ldr q2, [sp] +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: str q2, [x0, #16] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %tmp1 = load volatile <16 x i16>, ptr %a + %tmp2 = load volatile <16 x i16>, ptr %b + %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> + store volatile <16 x i16> %tmp3, ptr %a + ret void +} + +define void @zip1_v8i32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: zip1_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: mov z2.s, z0.s[3] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.s, z1.s[3] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.s, z1.s[2] +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: ldr q2, [sp] +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: str q2, [x0, #16] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %tmp1 = load volatile <8 x i32>, ptr %a + %tmp2 = load volatile <8 x i32>, ptr %b + %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x 
i32> + store volatile <8 x i32> %tmp3, ptr %a + ret void +} + +define void @zip_v4f64(ptr %a, ptr %b) #0 { +; CHECK-LABEL: zip_v4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: zip1 z4.d, z1.d, z2.d +; CHECK-NEXT: trn2 z1.d, z1.d, z2.d +; CHECK-NEXT: zip1 z2.d, z0.d, z3.d +; CHECK-NEXT: trn2 z0.d, z0.d, z3.d +; CHECK-NEXT: fadd z2.d, p0/m, z2.d, z4.d +; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: ret + %tmp1 = load <4 x double>, ptr %a + %tmp2 = load <4 x double>, ptr %b + %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> + %tmp5 = fadd <4 x double> %tmp3, %tmp4 + store <4 x double> %tmp5, ptr %a + ret void +} + +define void @zip_v4i32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: zip_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: mov z2.s, z0.s[3] +; CHECK-NEXT: mov z3.s, z1.s[3] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z3.s, z1.s[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: zip1 z0.s, z1.s, z0.s +; CHECK-NEXT: stp w9, w8, [sp, #8] +; CHECK-NEXT: stp w11, w10, [sp] +; CHECK-NEXT: ldr q2, [sp] +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %tmp1 = load <4 x i32>, ptr %a + %tmp2 = load <4 x i32>, ptr %b + %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + %tmp5 = add <4 x i32> %tmp3, %tmp4 + store <4 x i32> %tmp5, ptr %a + ret void +} + +define void @zip1_v8i32_undef(ptr %a) #0 { +; CHECK-LABEL: zip1_v8i32_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z1.s, z0.s[2] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: zip1 z0.s, z0.s, z0.s +; CHECK-NEXT: stp w8, w8, [sp, #8] +; CHECK-NEXT: stp w9, w9, [sp] +; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: str q1, [x0, #16] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %tmp1 = load volatile <8 x i32>, ptr %a + %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> + store volatile <8 x i32> %tmp2, ptr %a + ret void +} + +define void @trn_v32i8(ptr %a, ptr %b) #0 { +; CHECK-LABEL: trn_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: trn1 z4.b, z1.b, z2.b +; CHECK-NEXT: trn2 z1.b, z1.b, z2.b +; CHECK-NEXT: add z1.b, z4.b, z1.b +; CHECK-NEXT: trn1 z5.b, z0.b, z3.b +; CHECK-NEXT: trn2 z0.b, z0.b, z3.b +; CHECK-NEXT: add z0.b, z5.b, z0.b +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %tmp1 = load <32 x i8>, ptr %a + %tmp2 = load <32 x i8>, ptr %b + %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> + %tmp4 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> + %tmp5 = add <32 x i8> %tmp3, %tmp4 + store <32 x i8> %tmp5, ptr %a + ret void +} + +define void @trn_v8i16(ptr %a, ptr %b) #0 { +; CHECK-LABEL: trn_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.h, z0.h[3] +; CHECK-NEXT: mov z2.h, z0.h[1] +; CHECK-NEXT: mov z6.h, 
z0.h[2] +; CHECK-NEXT: mov z3.h, z0.h[5] +; CHECK-NEXT: mov z4.h, z0.h[4] +; CHECK-NEXT: mov z5.h, z0.h[6] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z0.h, z0.h[7] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: strh w8, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: fmov w12, s4 +; CHECK-NEXT: fmov w13, s5 +; CHECK-NEXT: strh w11, [sp, #4] +; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: strh w12, [sp, #8] +; CHECK-NEXT: strh w13, [sp, #6] +; CHECK-NEXT: strh w11, [sp, #2] +; CHECK-NEXT: strh w11, [sp, #28] +; CHECK-NEXT: strh w12, [sp, #26] +; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: strh w9, [sp, #20] +; CHECK-NEXT: strh w13, [sp, #18] +; CHECK-NEXT: strh w10, [sp, #16] +; CHECK-NEXT: ldp q0, q1, [sp] +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret + %tmp1 = load <8 x i16>, ptr %a + %tmp2 = load <8 x i16>, ptr %b + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp5 = add <8 x i16> %tmp3, %tmp4 + store <8 x i16> %tmp5, ptr %a + ret void +} + +define void @trn_v16i16(ptr %a, ptr %b) #0 { +; CHECK-LABEL: trn_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: trn1 z4.h, z1.h, z2.h +; CHECK-NEXT: trn2 z1.h, z1.h, z2.h +; CHECK-NEXT: add z1.h, z4.h, z1.h +; CHECK-NEXT: trn1 z5.h, z0.h, z3.h +; CHECK-NEXT: trn2 z0.h, z0.h, z3.h +; CHECK-NEXT: add z0.h, z5.h, z0.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %tmp1 = load <16 x i16>, ptr %a + %tmp2 = load <16 x i16>, ptr %b + %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> + %tmp4 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> + %tmp5 = add <16 x i16> %tmp3, %tmp4 + store <16 x i16> %tmp5, ptr %a + ret void +} + +define void @trn_v8i32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: trn_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: zip1 z4.s, z1.s, z2.s +; CHECK-NEXT: trn2 z1.s, z1.s, z2.s +; CHECK-NEXT: add z1.s, z4.s, z1.s +; CHECK-NEXT: trn1 z5.s, z0.s, z3.s +; CHECK-NEXT: trn2 z0.s, z0.s, z3.s +; CHECK-NEXT: add z0.s, z5.s, z0.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %tmp1 = load <8 x i32>, ptr %a + %tmp2 = load <8 x i32>, ptr %b + %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> + %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> + %tmp5 = add <8 x i32> %tmp3, %tmp4 + store <8 x i32> %tmp5, ptr %a + ret void +} + +define void @trn_v4f64(ptr %a, ptr %b) #0 { +; CHECK-LABEL: trn_v4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: zip1 z4.d, z1.d, z2.d +; CHECK-NEXT: trn2 z1.d, z1.d, z2.d +; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z4.d +; CHECK-NEXT: zip1 z5.d, z0.d, z3.d +; CHECK-NEXT: trn2 z0.d, z0.d, z3.d +; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z5.d +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %tmp1 = load <4 x double>, ptr %a + %tmp2 = load <4 x double>, ptr %b + %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> + %tmp5 = fadd <4 x double> %tmp3, %tmp4 + store <4 x double> %tmp5, ptr %a + ret void +} + +define void @trn_v4f32(ptr %a, ptr %b) #0 
{ +; CHECK-LABEL: trn_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: trn1 z2.s, z0.s, z1.s +; CHECK-NEXT: trn2 z0.s, z0.s, z1.s +; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %tmp1 = load <4 x float>, ptr %a + %tmp2 = load <4 x float>, ptr %b + %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> + %tmp5 = fadd <4 x float> %tmp3, %tmp4 + store <4 x float> %tmp5, ptr %a + ret void +} + +define void @trn_v8i32_undef(ptr %a) #0 { +; CHECK-LABEL: trn_v8i32_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: trn1 z2.s, z0.s, z0.s +; CHECK-NEXT: trn2 z0.s, z0.s, z0.s +; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: trn1 z3.s, z1.s, z1.s +; CHECK-NEXT: trn2 z1.s, z1.s, z1.s +; CHECK-NEXT: add z1.s, z3.s, z1.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %tmp1 = load <8 x i32>, ptr %a + %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> + %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> + %tmp5 = add <8 x i32> %tmp3, %tmp4 + store <8 x i32> %tmp5, ptr %a + ret void +} + +define void @zip2_v32i8(ptr %a, ptr %b) #0{ +; CHECK-LABEL: zip2_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: mov z2.b, z0.b[15] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.b, z0.b[14] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.b, z0.b[13] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.b, z0.b[12] +; CHECK-NEXT: strb w8, [sp, #14] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.b, z0.b[11] +; CHECK-NEXT: strb w9, [sp, #12] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.b, z0.b[10] +; CHECK-NEXT: strb w10, [sp, #10] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.b, z0.b[9] +; CHECK-NEXT: strb w8, [sp, #8] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.b, z0.b[8] +; CHECK-NEXT: strb w9, [sp, #6] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.b, z1.b[15] +; CHECK-NEXT: strb w10, [sp, #4] +; CHECK-NEXT: strb w8, [sp, #2] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.b, z1.b[14] +; CHECK-NEXT: strb w9, [sp] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.b, z1.b[13] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.b, z1.b[12] +; CHECK-NEXT: strb w8, [sp, #15] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.b, z1.b[11] +; CHECK-NEXT: strb w9, [sp, #13] +; CHECK-NEXT: strb w10, [sp, #11] +; CHECK-NEXT: zip1 z0.b, z0.b, z1.b +; CHECK-NEXT: strb w8, [sp, #9] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.b, z1.b[10] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.b, z1.b[9] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.b, z1.b[8] +; CHECK-NEXT: strb w8, [sp, #7] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strb w9, [sp, #5] +; CHECK-NEXT: strb w10, [sp, #3] +; CHECK-NEXT: strb w8, [sp, #1] +; CHECK-NEXT: ldr q2, [sp] +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: str q2, [x0, #16] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %tmp1 = load volatile <32 x i8>, ptr %a + %tmp2 = load volatile <32 x i8>, ptr %b + %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> + store volatile <32 x i8> %tmp3, ptr %a + ret void +} + +define void @zip2_v16i16(ptr %a, ptr %b) #0{ +; CHECK-LABEL: zip2_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, 
sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: mov z2.h, z0.h[7] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.h, z0.h[5] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.h, z0.h[4] +; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.h, z1.h[7] +; CHECK-NEXT: strh w9, [sp, #8] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.h, z1.h[6] +; CHECK-NEXT: strh w10, [sp, #4] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.h, z1.h[5] +; CHECK-NEXT: strh w8, [sp] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.h, z1.h[4] +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strh w10, [sp, #10] +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h +; CHECK-NEXT: strh w8, [sp, #6] +; CHECK-NEXT: strh w9, [sp, #2] +; CHECK-NEXT: ldr q2, [sp] +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: str q2, [x0, #16] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %tmp1 = load volatile <16 x i16>, ptr %a + %tmp2 = load volatile <16 x i16>, ptr %b + %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> + store volatile <16 x i16> %tmp3, ptr %a + ret void +} + +define void @zip2_v8i32(ptr %a, ptr %b) #0{ +; CHECK-LABEL: zip2_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: mov z2.s, z0.s[3] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.s, z1.s[3] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z2.s, z1.s[2] +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: ldr q2, [sp] +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: str q2, [x0, #16] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %tmp1 = load volatile <8 x i32>, ptr %a + %tmp2 = load volatile <8 x i32>, ptr %b + %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> + store volatile <8 x i32> %tmp3, ptr %a + ret void +} + +define void @zip2_v8i32_undef(ptr %a) #0{ +; CHECK-LABEL: zip2_v8i32_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z1.s, z0.s[2] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: zip1 z0.s, z0.s, z0.s +; CHECK-NEXT: stp w8, w8, [sp, #8] +; CHECK-NEXT: stp w9, w9, [sp] +; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: str q1, [x0, #16] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %tmp1 = load volatile <8 x i32>, ptr %a + %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> + store volatile <8 x i32> %tmp2, ptr %a + ret void +} + +define void @uzp_v32i8(ptr %a, ptr %b) #0{ +; CHECK-LABEL: uzp_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #128 +; CHECK-NEXT: .cfi_def_cfa_offset 128 +; CHECK-NEXT: stp d15, d14, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #112] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset 
b10, -24 +; CHECK-NEXT: .cfi_offset b11, -32 +; CHECK-NEXT: .cfi_offset b12, -40 +; CHECK-NEXT: .cfi_offset b13, -48 +; CHECK-NEXT: .cfi_offset b14, -56 +; CHECK-NEXT: .cfi_offset b15, -64 +; CHECK-NEXT: ldp q0, q3, [x0] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z27.b, z0.b[14] +; CHECK-NEXT: mov z28.b, z0.b[12] +; CHECK-NEXT: mov z30.b, z0.b[8] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z2.b, z3.b[12] +; CHECK-NEXT: mov z4.b, z3.b[10] +; CHECK-NEXT: mov z1.b, z3.b[14] +; CHECK-NEXT: ldp q10, q11, [x1] +; CHECK-NEXT: strb w8, [sp, #40] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strb w9, [sp, #32] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: mov z6.b, z3.b[6] +; CHECK-NEXT: mov z7.b, z3.b[4] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: strb w8, [sp, #46] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: strb w9, [sp, #45] +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: mov z5.b, z3.b[8] +; CHECK-NEXT: strb w10, [sp, #47] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w8, [sp, #43] +; CHECK-NEXT: fmov w8, s27 +; CHECK-NEXT: strb w9, [sp, #42] +; CHECK-NEXT: fmov w9, s28 +; CHECK-NEXT: mov z16.b, z3.b[2] +; CHECK-NEXT: mov z31.b, z0.b[6] +; CHECK-NEXT: strb w10, [sp, #44] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: strb w8, [sp, #39] +; CHECK-NEXT: fmov w8, s30 +; CHECK-NEXT: strb w9, [sp, #38] +; CHECK-NEXT: fmov w9, s31 +; CHECK-NEXT: mov z29.b, z0.b[10] +; CHECK-NEXT: mov z9.b, z0.b[2] +; CHECK-NEXT: strb w10, [sp, #41] +; CHECK-NEXT: fmov w10, s29 +; CHECK-NEXT: strb w8, [sp, #36] +; CHECK-NEXT: fmov w8, s9 +; CHECK-NEXT: strb w9, [sp, #35] +; CHECK-NEXT: fmov w9, s11 +; CHECK-NEXT: mov z8.b, z0.b[4] +; CHECK-NEXT: mov z16.b, z11.b[4] +; CHECK-NEXT: mov z27.b, z11.b[2] +; CHECK-NEXT: strb w10, [sp, #37] +; CHECK-NEXT: fmov w10, s8 +; CHECK-NEXT: strb w8, [sp, #33] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: strb w9, [sp, #8] +; CHECK-NEXT: fmov w9, s27 +; CHECK-NEXT: mov z5.b, z11.b[10] +; CHECK-NEXT: mov z6.b, z11.b[8] +; CHECK-NEXT: mov z2.b, z11.b[14] +; CHECK-NEXT: fmov w12, s5 +; CHECK-NEXT: fmov w13, s6 +; CHECK-NEXT: mov z5.b, z10.b[10] +; CHECK-NEXT: mov z6.b, z10.b[8] +; CHECK-NEXT: strb w10, [sp, #34] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strb w8, [sp, #10] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: strb w9, [sp, #9] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z4.b, z11.b[12] +; CHECK-NEXT: mov z7.b, z11.b[6] +; CHECK-NEXT: mov z28.b, z11.b[15] +; CHECK-NEXT: mov z29.b, z11.b[13] +; CHECK-NEXT: mov z30.b, z11.b[11] +; CHECK-NEXT: mov z31.b, z11.b[9] +; CHECK-NEXT: mov z8.b, z11.b[7] +; CHECK-NEXT: mov z9.b, z11.b[5] +; CHECK-NEXT: mov z12.b, z11.b[3] +; CHECK-NEXT: mov z13.b, z11.b[1] +; CHECK-NEXT: mov z2.b, z10.b[14] +; CHECK-NEXT: mov z11.b, z10.b[4] +; CHECK-NEXT: mov z14.b, z10.b[2] +; CHECK-NEXT: strb w10, [sp, #15] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strb w8, [sp, #5] +; CHECK-NEXT: fmov w8, s11 +; CHECK-NEXT: strb w9, [sp, #4] +; CHECK-NEXT: fmov w9, s14 +; CHECK-NEXT: mov z17.b, z3.b[15] +; CHECK-NEXT: mov z18.b, z3.b[13] +; CHECK-NEXT: fmov w14, s7 +; CHECK-NEXT: mov z7.b, z10.b[6] +; CHECK-NEXT: strb w10, [sp, #7] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strb w8, [sp, #2] +; CHECK-NEXT: fmov w8, s17 +; CHECK-NEXT: strb w9, [sp, #1] +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: mov z19.b, z3.b[11] +; CHECK-NEXT: mov z20.b, z3.b[9] +; CHECK-NEXT: mov z21.b, z3.b[7] +; CHECK-NEXT: strb w10, [sp, #3] +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: strb w8, [sp, #63] +; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: strb w9, [sp, #62] +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: mov 
z22.b, z3.b[5] +; CHECK-NEXT: mov z23.b, z3.b[3] +; CHECK-NEXT: mov z3.b, z0.b[13] +; CHECK-NEXT: strb w10, [sp, #61] +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: strb w8, [sp, #60] +; CHECK-NEXT: fmov w8, s23 +; CHECK-NEXT: strb w9, [sp, #59] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z24.b, z0.b[11] +; CHECK-NEXT: mov z25.b, z0.b[9] +; CHECK-NEXT: mov z26.b, z0.b[5] +; CHECK-NEXT: strb w10, [sp, #58] +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: strb w8, [sp, #57] +; CHECK-NEXT: fmov w8, s25 +; CHECK-NEXT: strb w9, [sp, #54] +; CHECK-NEXT: fmov w9, s26 +; CHECK-NEXT: mov z1.b, z0.b[3] +; CHECK-NEXT: mov z0.b, z0.b[1] +; CHECK-NEXT: strb w10, [sp, #53] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: strb w8, [sp, #52] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strb w9, [sp, #50] +; CHECK-NEXT: fmov w9, s28 +; CHECK-NEXT: strb w10, [sp, #49] +; CHECK-NEXT: fmov w10, s29 +; CHECK-NEXT: strb w8, [sp, #48] +; CHECK-NEXT: fmov w8, s30 +; CHECK-NEXT: strb w9, [sp, #31] +; CHECK-NEXT: fmov w9, s31 +; CHECK-NEXT: strb w10, [sp, #30] +; CHECK-NEXT: fmov w10, s8 +; CHECK-NEXT: strb w8, [sp, #29] +; CHECK-NEXT: fmov w8, s9 +; CHECK-NEXT: strb w9, [sp, #28] +; CHECK-NEXT: fmov w9, s12 +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: mov z15.b, z10.b[15] +; CHECK-NEXT: mov z16.b, z10.b[13] +; CHECK-NEXT: strb w10, [sp, #27] +; CHECK-NEXT: fmov w10, s13 +; CHECK-NEXT: strb w8, [sp, #26] +; CHECK-NEXT: fmov w8, s15 +; CHECK-NEXT: strb w9, [sp, #25] +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: mov z4.b, z10.b[12] +; CHECK-NEXT: mov z27.b, z10.b[11] +; CHECK-NEXT: strb w11, [sp, #14] +; CHECK-NEXT: mov z2.b, z10.b[9] +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: mov z4.b, z10.b[7] +; CHECK-NEXT: strb w10, [sp, #24] +; CHECK-NEXT: fmov w10, s27 +; CHECK-NEXT: strb w8, [sp, #23] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: strb w9, [sp, #22] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: mov z5.b, z10.b[5] +; CHECK-NEXT: mov z6.b, z10.b[3] +; CHECK-NEXT: mov z7.b, z10.b[1] +; CHECK-NEXT: fmov w15, s10 +; CHECK-NEXT: strb w10, [sp, #21] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w8, [sp, #20] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: strb w9, [sp, #19] +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: strb w15, [sp] +; CHECK-NEXT: strb w12, [sp, #13] +; CHECK-NEXT: ldr q17, [sp, #32] +; CHECK-NEXT: strb w13, [sp, #12] +; CHECK-NEXT: ldr q0, [sp, #48] +; CHECK-NEXT: strb w14, [sp, #11] +; CHECK-NEXT: strb w11, [sp, #6] +; CHECK-NEXT: strb w10, [sp, #18] +; CHECK-NEXT: ldr q18, [sp] +; CHECK-NEXT: strb w8, [sp, #17] +; CHECK-NEXT: add z0.b, z17.b, z0.b +; CHECK-NEXT: strb w9, [sp, #16] +; CHECK-NEXT: ldr q1, [sp, #16] +; CHECK-NEXT: ldp d9, d8, [sp, #112] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: add z1.b, z18.b, z1.b +; CHECK-NEXT: ldp d13, d12, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ldp d15, d14, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #128 +; CHECK-NEXT: ret + %tmp1 = load <32 x i8>, ptr %a + %tmp2 = load <32 x i8>, ptr %b + %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> + %tmp4 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> + %tmp5 = add <32 x i8> %tmp3, %tmp4 + store <32 x i8> %tmp5, ptr %a + ret void +} + +define void @uzp_v4i16(ptr %a, ptr %b) #0{ +; CHECK-LABEL: uzp_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: mov z0.h, z0.h[3] +; CHECK-NEXT: fmov w9, s1 
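+; (The deinterleave is presumably rebuilt through the stack here rather than with UZP1/UZP2.)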
+; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: strh w8, [sp, #-16]! +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: strh w9, [sp, #6] +; CHECK-NEXT: strh w10, [sp, #4] +; CHECK-NEXT: strh w11, [sp, #2] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w9, [sp, #8] +; CHECK-NEXT: ldp d0, d1, [sp] +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %tmp1 = load <4 x i16>, ptr %a + %tmp2 = load <4 x i16>, ptr %b + %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> + %tmp5 = add <4 x i16> %tmp3, %tmp4 + store <4 x i16> %tmp5, ptr %a + ret void +} + +define void @uzp_v16i16(ptr %a, ptr %b) #0{ +; CHECK-LABEL: uzp_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: mov z17.h, z0.h[4] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z18.h, z0.h[2] +; CHECK-NEXT: mov z19.h, z0.h[7] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z3.h, z1.h[4] +; CHECK-NEXT: ldp q21, q22, [x1] +; CHECK-NEXT: mov z2.h, z1.h[6] +; CHECK-NEXT: mov z4.h, z1.h[2] +; CHECK-NEXT: strh w8, [sp, #40] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z5.h, z1.h[7] +; CHECK-NEXT: mov z6.h, z1.h[5] +; CHECK-NEXT: mov z7.h, z1.h[3] +; CHECK-NEXT: strh w8, [sp, #44] +; CHECK-NEXT: fmov w8, s17 +; CHECK-NEXT: mov z16.h, z1.h[1] +; CHECK-NEXT: mov z1.h, z0.h[6] +; CHECK-NEXT: strh w9, [sp, #32] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: strh w10, [sp, #46] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: strh w8, [sp, #36] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z2.h, z22.h[6] +; CHECK-NEXT: strh w9, [sp, #42] +; CHECK-NEXT: strh w10, [sp, #38] +; CHECK-NEXT: fmov w9, s22 +; CHECK-NEXT: fmov w10, s21 +; CHECK-NEXT: strh w8, [sp, #34] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.h, z22.h[4] +; CHECK-NEXT: mov z4.h, z22.h[2] +; CHECK-NEXT: mov z17.h, z22.h[7] +; CHECK-NEXT: mov z18.h, z22.h[5] +; CHECK-NEXT: mov z23.h, z22.h[3] +; CHECK-NEXT: mov z24.h, z22.h[1] +; CHECK-NEXT: mov z22.h, z21.h[6] +; CHECK-NEXT: strh w9, [sp, #8] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strh w10, [sp] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strh w8, [sp, #14] +; CHECK-NEXT: fmov w8, s22 +; CHECK-NEXT: mov z25.h, z21.h[4] +; CHECK-NEXT: mov z26.h, z21.h[2] +; CHECK-NEXT: strh w9, [sp, #12] +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: strh w10, [sp, #10] +; CHECK-NEXT: fmov w10, s26 +; CHECK-NEXT: strh w8, [sp, #6] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: strh w9, [sp, #4] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strh w10, [sp, #2] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: strh w8, [sp, #62] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z20.h, z0.h[5] +; CHECK-NEXT: mov z1.h, z0.h[3] +; CHECK-NEXT: strh w9, [sp, #60] +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: strh w10, [sp, #58] +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: strh w8, [sp, #56] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: strh w9, [sp, #54] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: strh w10, [sp, #52] +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: strh w8, [sp, #50] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z27.h, z21.h[7] +; CHECK-NEXT: strh w9, [sp, #48] +; CHECK-NEXT: fmov w9, s23 +; CHECK-NEXT: strh w10, [sp, #30] +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: strh w8, [sp, #28] +; CHECK-NEXT: fmov w8, s27 +; CHECK-NEXT: mov 
z28.h, z21.h[5] +; CHECK-NEXT: mov z2.h, z21.h[3] +; CHECK-NEXT: mov z3.h, z21.h[1] +; CHECK-NEXT: strh w9, [sp, #26] +; CHECK-NEXT: fmov w9, s28 +; CHECK-NEXT: strh w10, [sp, #24] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: ldr q4, [sp, #32] +; CHECK-NEXT: strh w9, [sp, #20] +; CHECK-NEXT: ldr q5, [sp] +; CHECK-NEXT: strh w10, [sp, #18] +; CHECK-NEXT: ldr q0, [sp, #48] +; CHECK-NEXT: strh w8, [sp, #16] +; CHECK-NEXT: ldr q1, [sp, #16] +; CHECK-NEXT: add z0.h, z4.h, z0.h +; CHECK-NEXT: add z1.h, z5.h, z1.h +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: ret + %tmp1 = load <16 x i16>, ptr %a + %tmp2 = load <16 x i16>, ptr %b + %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> + %tmp4 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> + %tmp5 = add <16 x i16> %tmp3, %tmp4 + store <16 x i16> %tmp5, ptr %a + ret void +} + +define void @uzp_v8f32(ptr %a, ptr %b) #0{ +; CHECK-LABEL: uzp_v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q3, q2, [x1] +; CHECK-NEXT: mov z4.s, z0.s[2] +; CHECK-NEXT: stp s0, s4, [sp, #24] +; CHECK-NEXT: mov z4.s, z3.s[2] +; CHECK-NEXT: mov z5.s, z2.s[2] +; CHECK-NEXT: stp s4, s2, [sp, #4] +; CHECK-NEXT: stp s5, s1, [sp, #12] +; CHECK-NEXT: mov z5.s, z0.s[3] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: mov z1.s, z1.s[1] +; CHECK-NEXT: stp s0, s5, [sp, #40] +; CHECK-NEXT: mov z0.s, z3.s[3] +; CHECK-NEXT: str s1, [sp, #32] +; CHECK-NEXT: mov z1.s, z3.s[1] +; CHECK-NEXT: stp s1, s0, [sp, #48] +; CHECK-NEXT: ldp q4, q2, [sp] +; CHECK-NEXT: ldp q0, q1, [sp, #32] +; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z4.s +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: ret + %tmp1 = load <8 x float>, ptr %a + %tmp2 = load <8 x float>, ptr %b + %tmp3 = shufflevector <8 x float> %tmp1, <8 x float> %tmp2, <8 x i32> + %tmp4 = shufflevector <8 x float> %tmp1, <8 x float> %tmp2, <8 x i32> + %tmp5 = fadd <8 x float> %tmp3, %tmp4 + store <8 x float> %tmp5, ptr %a + ret void +} + +define void @uzp_v4i64(ptr %a, ptr %b) #0{ +; CHECK-LABEL: uzp_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] +; CHECK-NEXT: zip1 z4.d, z1.d, z0.d +; CHECK-NEXT: trn2 z0.d, z1.d, z0.d +; CHECK-NEXT: add z0.d, z4.d, z0.d +; CHECK-NEXT: zip1 z5.d, z3.d, z2.d +; CHECK-NEXT: trn2 z1.d, z3.d, z2.d +; CHECK-NEXT: add z1.d, z5.d, z1.d +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %tmp1 = load <4 x i64>, ptr %a + %tmp2 = load <4 x i64>, ptr %b + %tmp3 = shufflevector <4 x i64> %tmp1, <4 x i64> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x i64> %tmp1, <4 x i64> %tmp2, <4 x i32> + %tmp5 = add <4 x i64> %tmp3, %tmp4 + store <4 x i64> %tmp5, ptr %a + ret void +} + +define void @uzp_v8i16(ptr %a, ptr %b) #0{ +; CHECK-LABEL: uzp_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: mov z3.h, z0.h[4] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z4.h, z0.h[2] +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z5.h, z0.h[7] +; CHECK-NEXT: mov z6.h, z0.h[5] +; CHECK-NEXT: mov z7.h, z0.h[3] +; CHECK-NEXT: mov z16.h, z0.h[1] +; CHECK-NEXT: mov 
z0.h, z1.h[6] +; CHECK-NEXT: mov z17.h, z1.h[4] +; CHECK-NEXT: strh w9, [sp] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: strh w10, [sp, #14] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: fmov w8, s17 +; CHECK-NEXT: mov z18.h, z1.h[2] +; CHECK-NEXT: strh w9, [sp, #10] +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: strh w10, [sp, #6] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strh w8, [sp, #4] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z19.h, z1.h[7] +; CHECK-NEXT: strh w9, [sp, #2] +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: strh w10, [sp, #30] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: strh w8, [sp, #28] +; CHECK-NEXT: fmov w8, s19 +; CHECK-NEXT: mov z20.h, z1.h[5] +; CHECK-NEXT: mov z21.h, z1.h[3] +; CHECK-NEXT: mov z0.h, z1.h[1] +; CHECK-NEXT: strh w9, [sp, #26] +; CHECK-NEXT: fmov w9, s20 +; CHECK-NEXT: strh w10, [sp, #24] +; CHECK-NEXT: fmov w10, s21 +; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w9, [sp, #20] +; CHECK-NEXT: strh w10, [sp, #18] +; CHECK-NEXT: strh w8, [sp, #16] +; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: add z0.h, z1.h, z0.h +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret + %tmp1 = load <8 x i16>, ptr %a + %tmp2 = load <8 x i16>, ptr %b + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp5 = add <8 x i16> %tmp3, %tmp4 + store <8 x i16> %tmp5, ptr %a + ret void +} + +define void @uzp_v8i32_undef(ptr %a) #0{ +; CHECK-LABEL: uzp_v8i32_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: mov z5.s, z1.s[3] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z3.s, z0.s[3] +; CHECK-NEXT: mov z4.s, z0.s[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z0.s, z1.s[2] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: mov z2.s, z1.s[1] +; CHECK-NEXT: fmov w12, s3 +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: stp w10, w11, [sp] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: stp w8, w12, [sp, #24] +; CHECK-NEXT: stp w10, w9, [sp, #16] +; CHECK-NEXT: ldp q0, q1, [sp] +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: stp q0, q0, [x0] +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret + %tmp1 = load <8 x i32>, ptr %a + %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> + %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> + %tmp5 = add <8 x i32> %tmp3, %tmp4 + store <8 x i32> %tmp5, ptr %a + ret void +} + +define void @zip_vscale2_4(ptr %a, ptr %b) #0 { +; CHECK-LABEL: zip_vscale2_4: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: zip1 z4.d, z1.d, z2.d +; CHECK-NEXT: trn2 z1.d, z1.d, z2.d +; CHECK-NEXT: zip1 z2.d, z0.d, z3.d +; CHECK-NEXT: trn2 z0.d, z0.d, z3.d +; CHECK-NEXT: fadd z2.d, p0/m, z2.d, z4.d +; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: ret + %tmp1 = load <4 x double>, ptr %a + %tmp2 = load <4 x double>, ptr %b + %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> + %tmp5 = fadd <4 x double> %tmp3, %tmp4 + store <4 x double> %tmp5, ptr %a + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll @@ -0,0 +1,341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +; Test we can code generate patterns of the form: +; fixed_length_vector = ISD::EXTRACT_SUBVECTOR scalable_vector, 0 +; scalable_vector = ISD::INSERT_SUBVECTOR scalable_vector, fixed_length_vector, 0 +; +; NOTE: Currently shufflevector does not support scalable vectors so it cannot +; be used to model the above operations. Instead these tests rely on knowing +; how fixed length operations are lowered to scalable ones, with multiple blocks +; ensuring insert/extract sequences are not folded away. + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +; i8 +define void @subvector_v4i8(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v4i8: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: st1b { z0.h }, p0, [x1] +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %in + br label %bb1 + +bb1: + store <4 x i8> %a, ptr %out + ret void +} + +define void @subvector_v8i8(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v8i8: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: str d0, [x1] +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %in + br label %bb1 + +bb1: + store <8 x i8> %a, ptr %out + ret void +} + +define void @subvector_v16i8(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v16i8: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret + %a = load <16 x i8>, ptr %in + br label %bb1 + +bb1: + store <16 x i8> %a, ptr %out + ret void +} + +define void @subvector_v32i8(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v32i8: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ret + %a = load <32 x i8>, ptr %in + br label %bb1 + +bb1: + store <32 x i8> %a, ptr %out + ret void +} + +; i16 +define void @subvector_v2i16(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v2i16: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldrh w8, [x0, #2] +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: str w8, [sp, #12] +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: str w8, [sp, #8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %in + br label %bb1 + +bb1: + store <2 x i16> %a, ptr %out + ret void +} + +define void @subvector_v4i16(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v4i16: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: str d0, [x1] +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %in + br label %bb1 + +bb1: + store <4 x i16> %a, ptr %out + ret void +} + +define void @subvector_v8i16(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v8i16: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret + %a = load <8 x i16>, ptr %in + br label %bb1 + +bb1: + store <8 x i16> %a, ptr %out + ret void +} + +define void @subvector_v16i16(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v16i16: +; CHECK: // %bb.0: // %bb1 +; 
CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ret + %a = load <16 x i16>, ptr %in + br label %bb1 + +bb1: + store <16 x i16> %a, ptr %out + ret void +} + +; i32 +define void @subvector_v2i32(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v2i32: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: str d0, [x1] +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %in + br label %bb1 + +bb1: + store <2 x i32> %a, ptr %out + ret void +} + +define void @subvector_v4i32(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v4i32: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret + %a = load <4 x i32>, ptr %in + br label %bb1 + +bb1: + store <4 x i32> %a, ptr %out + ret void +} + +define void @subvector_v8i32(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v8i32: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ret + %a = load <8 x i32>, ptr %in + br label %bb1 + +bb1: + store <8 x i32> %a, ptr %out + ret void +} + +; i64 +define void @subvector_v2i64(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v2i64: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret + %a = load <2 x i64>, ptr %in + br label %bb1 + +bb1: + store <2 x i64> %a, ptr %out + ret void +} + +define void @subvector_v4i64(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v4i64: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ret + %a = load <4 x i64>, ptr %in + br label %bb1 + +bb1: + store <4 x i64> %a, ptr %out + ret void +} + +; f16 +define void @subvector_v2f16(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v2f16: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: str w8, [x1] +; CHECK-NEXT: ret + %a = load <2 x half>, ptr %in + br label %bb1 + +bb1: + store <2 x half> %a, ptr %out + ret void +} + +define void @subvector_v4f16(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v4f16: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: str d0, [x1] +; CHECK-NEXT: ret + %a = load <4 x half>, ptr %in + br label %bb1 + +bb1: + store <4 x half> %a, ptr %out + ret void +} + +define void @subvector_v8f16(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v8f16: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret + %a = load <8 x half>, ptr %in + br label %bb1 + +bb1: + store <8 x half> %a, ptr %out + ret void +} + +define void @subvector_v16f16(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v16f16: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ret + %a = load <16 x half>, ptr %in + br label %bb1 + +bb1: + store <16 x half> %a, ptr %out + ret void +} + +; f32 +define void @subvector_v2f32(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v2f32: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: str d0, [x1] +; CHECK-NEXT: ret + %a = load <2 x float>, ptr %in + br label %bb1 + +bb1: + store <2 x float> %a, ptr %out + ret void +} + +define void @subvector_v4f32(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v4f32: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret + %a = load <4 x float>, ptr %in + br label %bb1 + +bb1: + store <4 x float> %a, ptr %out + ret void +} + +define void @subvector_v8f32(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v8f32: 
+; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ret + %a = load <8 x float>, ptr %in + br label %bb1 + +bb1: + store <8 x float> %a, ptr %out + ret void +} + +; f64 +define void @subvector_v2f64(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v2f64: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret + %a = load <2 x double>, ptr %in + br label %bb1 + +bb1: + store <2 x double> %a, ptr %out + ret void +} + +define void @subvector_v4f64(ptr %in, ptr %out) #0 { +; CHECK-LABEL: subvector_v4f64: +; CHECK: // %bb.0: // %bb1 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ret + %a = load <4 x double>, ptr %in + br label %bb1 + +bb1: + store <4 x double> %a, ptr %out + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll @@ -0,0 +1,382 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) #0 { +; CHECK-LABEL: shuffle_ext_byone_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: mov z0.h, z0.h[3] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: strh w9, [sp, #14] +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: strh w11, [sp, #10] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> + ret <4 x i8> %ret +} + +define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: shuffle_ext_byone_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mov z0.b, z0.b[7] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: insr z1.b, w8 +; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: ret + %ret = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> + ret <8 x i8> %ret +} + +define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: shuffle_ext_byone_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov z0.b, z0.b[15] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: insr z1.b, w8 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %ret = shufflevector <16 x i8> %op1, <16 x i8> %op2, <16 x i32> + ret <16 x i8> %ret +} + +define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) #0 { +; CHECK-LABEL: shuffle_ext_byone_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: mov z0.b, z0.b[15] +; CHECK-NEXT: mov z2.b, z1.b[15] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: ldr q0, [x1, #16] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: insr z1.b, w8 +; CHECK-NEXT: insr z0.b, w9 +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, ptr %a + 
+define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0, #16]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: mov z0.b, z0.b[15]
+; CHECK-NEXT: mov z2.b, z1.b[15]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: ldr q0, [x1, #16]
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: insr z1.b, w8
+; CHECK-NEXT: insr z0.b, w9
+; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <32 x i8>, ptr %a
+ %op2 = load <32 x i8>, ptr %b
+ %ret = shufflevector <32 x i8> %op1, <32 x i8> %op2, <32 x i32>
+ store <32 x i8> %ret, ptr %a
+ ret void
+}
+
+define <2 x i16> @shuffle_ext_byone_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: revw z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %ret = shufflevector <2 x i16> %op1, <2 x i16> %op2, <2 x i32>
+ ret <2 x i16> %ret
+}
+
+define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: mov z0.h, z0.h[3]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: insr z1.h, w8
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
+ %ret = shufflevector <4 x i16> %op1, <4 x i16> %op2, <4 x i32>
+ ret <4 x i16> %ret
+}
+
+define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z0.h, z0.h[7]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: insr z1.h, w8
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+ %ret = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32>
+ ret <8 x i16> %ret
+}
+
+define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0, #16]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: mov z0.h, z0.h[7]
+; CHECK-NEXT: mov z2.h, z1.h[7]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: ldr q0, [x1, #16]
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: insr z1.h, w8
+; CHECK-NEXT: insr z0.h, w9
+; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <16 x i16>, ptr %a
+ %op2 = load <16 x i16>, ptr %b
+ %ret = shufflevector <16 x i16> %op1, <16 x i16> %op2, <16 x i32>
+ store <16 x i16> %ret, ptr %a
+ ret void
+}
+
+define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: mov z0.s, z0.s[1]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: insr z1.s, w8
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
+ %ret = shufflevector <2 x i32> %op1, <2 x i32> %op2, <2 x i32>
+ ret <2 x i32> %ret
+}
+
+define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: insr z1.s, w8
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+ %ret = shufflevector <4 x i32> %op1, <4 x i32> %op2, <4 x i32>
+ ret <4 x i32> %ret
+}
+
+define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0, #16]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: mov z2.s, z1.s[3]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: ldr q0, [x1, #16]
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: insr z1.s, w8
+; CHECK-NEXT: insr z0.s, w9
+; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <8 x i32>, ptr %a
+ %op2 = load <8 x i32>, ptr %b
+ %ret = shufflevector <8 x i32> %op1, <8 x i32> %op2, <8 x i32>
+ store <8 x i32> %ret, ptr %a
+ ret void
+}
+
+define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: insr z1.d, x8
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+ %ret = shufflevector <2 x i64> %op1, <2 x i64> %op2, <2 x i32>
+ ret <2 x i64> %ret
+}
+
+define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0, #16]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: mov z2.d, z1.d[1]
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: ldr q0, [x1, #16]
+; CHECK-NEXT: fmov x9, d2
+; CHECK-NEXT: insr z1.d, x8
+; CHECK-NEXT: insr z0.d, x9
+; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <4 x i64>, ptr %a
+ %op2 = load <4 x i64>, ptr %b
+ %ret = shufflevector <4 x i64> %op1, <4 x i64> %op2, <4 x i32>
+ store <4 x i64> %ret, ptr %a
+ ret void
+}
+
+define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: mov z0.h, z0.h[3]
+; CHECK-NEXT: insr z1.h, h0
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
+ %ret = shufflevector <4 x half> %op1, <4 x half> %op2, <4 x i32>
+ ret <4 x half> %ret
+}
+
+define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z0.h, z0.h[7]
+; CHECK-NEXT: insr z1.h, h0
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+ %ret = shufflevector <8 x half> %op1, <8 x half> %op2, <8 x i32>
+ ret <8 x half> %ret
+}
+
+define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q1, q2, [x1]
+; CHECK-NEXT: mov z3.h, z1.h[7]
+; CHECK-NEXT: ldr q0, [x0, #16]
+; CHECK-NEXT: insr z2.h, h3
+; CHECK-NEXT: mov z0.h, z0.h[7]
+; CHECK-NEXT: insr z1.h, h0
+; CHECK-NEXT: stp q1, q2, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <16 x half>, ptr %a
+ %op2 = load <16 x half>, ptr %b
+ %ret = shufflevector <16 x half> %op1, <16 x half> %op2, <16 x i32>
+ store <16 x half> %ret, ptr %a
+ ret void
+}
+
+define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: mov z0.s, z0.s[1]
+; CHECK-NEXT: insr z1.s, s0
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
+ %ret = shufflevector <2 x float> %op1, <2 x float> %op2, <2 x i32>
+ ret <2 x float> %ret
+}
+
+define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: insr z1.s, s0
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+ %ret = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32>
+ ret <4 x float> %ret
+}
+
+define void @shuffle_ext_byone_v8f32(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q1, q2, [x1]
+; CHECK-NEXT: mov z3.s, z1.s[3]
+; CHECK-NEXT: ldr q0, [x0, #16]
+; CHECK-NEXT: insr z2.s, s3
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: insr z1.s, s0
+; CHECK-NEXT: stp q1, q2, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <8 x float>, ptr %a
+ %op2 = load <8 x float>, ptr %b
+ %ret = shufflevector <8 x float> %op1, <8 x float> %op2, <8 x i32>
+ store <8 x float> %ret, ptr %a
+ ret void
+}
+
+define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: insr z1.d, d0
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+ %ret = shufflevector <2 x double> %op1, <2 x double> %op2, <2 x i32>
+ ret <2 x double> %ret
+}
+
+define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: shuffle_ext_byone_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q1, q2, [x1]
+; CHECK-NEXT: mov z3.d, z1.d[1]
+; CHECK-NEXT: ldr q0, [x0, #16]
+; CHECK-NEXT: insr z2.d, d3
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: insr z1.d, d0
+; CHECK-NEXT: stp q1, q2, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <4 x double>, ptr %a
+ %op2 = load <4 x double>, ptr %b
+ %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32>
+ store <4 x double> %ret, ptr %a
+ ret void
+}
+
+define void @shuffle_ext_byone_reverse(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: shuffle_ext_byone_reverse:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: mov z3.d, z1.d[1]
+; CHECK-NEXT: ldr q0, [x1, #16]
+; CHECK-NEXT: insr z2.d, d3
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: insr z1.d, d0
+; CHECK-NEXT: stp q1, q2, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <4 x double>, ptr %a
+ %op2 = load <4 x double>, ptr %b
+ %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32>
+ store <4 x double> %ret, ptr %a
+ ret void
+}
+
+; Negative test: this shuffle is not an "ext by one" pattern, so it should not
+; be lowered to INSR.
+define void @shuffle_ext_invalid(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: shuffle_ext_invalid:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0, #16]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: ret
+ %op1 = load <4 x double>, ptr %a
+ %op2 = load <4 x double>, ptr %b
+ %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32>
+ store <4 x double> %ret, ptr %a
+ ret void
+}
+
+attributes #0 = { "target-features"="+sve" }