diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
target triple = "aarch64"
@@ -10,10 +11,19 @@
define <8 x i32> @fixed_bitselect_v8i32(<8 x i32>* %pre_cond_ptr, <8 x i32>* %left_ptr, <8 x i32>* %right_ptr) #0 {
; CHECK-LABEL: fixed_bitselect_v8i32:
-; CHECK-NOT: bsl {{.*}}, {{.*}}, {{.*}}
-; CHECK-NOT: bit {{.*}}, {{.*}}, {{.*}}
-; CHECK-NOT: bif {{.*}}, {{.*}}, {{.*}}
-; CHECK: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: mov z3.s, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2]
+; CHECK-NEXT: add z3.s, z0.s, z3.s
+; CHECK-NEXT: subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: and z1.d, z3.d, z2.d
+; CHECK-NEXT: orr z0.d, z1.d, z0.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
%pre_cond = load <8 x i32>, <8 x i32>* %pre_cond_ptr
%left = load <8 x i32>, <8 x i32>* %left_ptr
%right = load <8 x i32>, <8 x i32>* %right_ptr
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
@@ -1,6 +1,7 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_384
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
@@ -24,6 +25,11 @@
; Don't use SVE for 64-bit vectors.
define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) #0 {
+; NO_SVE-LABEL: extract_subvector_v8i8:
+; NO_SVE: // %bb.0:
+; NO_SVE-NEXT: zip2 v0.8b, v0.8b, v0.8b
+; NO_SVE-NEXT: ret
+;
; CHECK-LABEL: extract_subvector_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b
@@ -34,6 +40,12 @@
; Don't use SVE for 128-bit vectors.
define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v16i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -44,6 +56,12 @@ } define void @extract_subvector_v32i8(<32 x i8>* %a, <16 x i8>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl32 @@ -58,6 +76,12 @@ } define void @extract_subvector_v64i8(<64 x i8>* %a, <32 x i8>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v64i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v64i8: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov w8, #32 @@ -66,6 +90,14 @@ ; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: extract_subvector_v64i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov w8, #32 +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: extract_subvector_v64i8: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 @@ -81,6 +113,38 @@ } define void @extract_subvector_v128i8(<128 x i8>* %a, <64 x i8>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v128i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v128i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov w8, #64 +; VBITS_EQ_256-NEXT: mov w9, #96 +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_EQ_256-NEXT: mov w8, #32 +; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x1, x8] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v128i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov w8, #64 +; VBITS_GE_384-NEXT: mov w9, #96 +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_384-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_384-NEXT: mov w8, #32 +; VBITS_GE_384-NEXT: st1b { z1.b }, p0, [x1, x8] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -96,6 +160,58 @@ } define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v256i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v256i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov w8, #128 +; VBITS_EQ_256-NEXT: mov w9, #160 +; VBITS_EQ_256-NEXT: mov w10, #224 +; VBITS_EQ_256-NEXT: mov 
w11, #192 +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_EQ_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_EQ_256-NEXT: ld1b { z3.b }, p0/z, [x0, x11] +; VBITS_EQ_256-NEXT: mov w8, #64 +; VBITS_EQ_256-NEXT: mov w9, #96 +; VBITS_EQ_256-NEXT: mov w10, #32 +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_EQ_256-NEXT: st1b { z3.b }, p0, [x1, x8] +; VBITS_EQ_256-NEXT: st1b { z2.b }, p0, [x1, x9] +; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x1, x10] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v256i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov w8, #128 +; VBITS_GE_384-NEXT: mov w9, #160 +; VBITS_GE_384-NEXT: mov w10, #224 +; VBITS_GE_384-NEXT: mov w11, #192 +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_384-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_384-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_384-NEXT: ld1b { z3.b }, p0/z, [x0, x11] +; VBITS_GE_384-NEXT: mov w8, #64 +; VBITS_GE_384-NEXT: mov w9, #96 +; VBITS_GE_384-NEXT: mov w10, #32 +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_384-NEXT: st1b { z3.b }, p0, [x1, x8] +; VBITS_GE_384-NEXT: st1b { z2.b }, p0, [x1, x9] +; VBITS_GE_384-NEXT: st1b { z1.b }, p0, [x1, x10] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 @@ -114,6 +230,16 @@ ; Don't use SVE for 64-bit vectors. define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v4i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: umov w8, v0.h[2] +; NO_SVE-NEXT: umov w9, v0.h[3] +; NO_SVE-NEXT: fmov s0, w8 +; NO_SVE-NEXT: mov v0.s[1], w9 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -129,6 +255,12 @@ ; Don't use SVE for 128-bit vectors. 
define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -139,6 +271,12 @@ } define void @extract_subvector_v16i16(<16 x i16>* %a, <8 x i16>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v16i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -153,6 +291,12 @@ } define void @extract_subvector_v32i16(<32 x i16>* %a, <16 x i16>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v32i16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #16 @@ -161,6 +305,14 @@ ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: extract_subvector_v32i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: extract_subvector_v32i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -176,6 +328,38 @@ } define void @extract_subvector_v64i16(<64 x i16>* %a, <32 x i16>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v64i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v64i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v64i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #32 +; VBITS_GE_384-NEXT: mov x9, #48 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -191,6 +375,58 @@ } define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v128i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v128i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #64 +; VBITS_EQ_256-NEXT: 
mov x9, #80 +; VBITS_EQ_256-NEXT: mov x10, #112 +; VBITS_EQ_256-NEXT: mov x11, #96 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v128i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #64 +; VBITS_GE_384-NEXT: mov x9, #80 +; VBITS_GE_384-NEXT: mov x10, #112 +; VBITS_GE_384-NEXT: mov x11, #96 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_384-NEXT: mov x8, #32 +; VBITS_GE_384-NEXT: mov x9, #48 +; VBITS_GE_384-NEXT: mov x10, #16 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_384-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -209,6 +445,12 @@ ; Don't use SVE for 64-bit vectors. define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v2i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: dup v0.2s, v0.s[1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -220,6 +462,12 @@ ; Don't use SVE for 128-bit vectors. 
define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v4i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -230,6 +478,12 @@ } define void @extract_subvector_v8i32(<8 x i32>* %a, <4 x i32>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -244,6 +498,12 @@ } define void @extract_subvector_v16i32(<16 x i32>* %a, <8 x i32>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v16i32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -252,6 +512,14 @@ ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: extract_subvector_v16i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: extract_subvector_v16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -267,6 +535,38 @@ } define void @extract_subvector_v32i32(<32 x i32>* %a, <16 x i32>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v32i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v32i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: mov x9, #24 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -282,6 +582,58 @@ } define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v64i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v64i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #40 +; 
VBITS_EQ_256-NEXT: mov x10, #56 +; VBITS_EQ_256-NEXT: mov x11, #48 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v64i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #32 +; VBITS_GE_384-NEXT: mov x9, #40 +; VBITS_GE_384-NEXT: mov x10, #56 +; VBITS_GE_384-NEXT: mov x11, #48 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: mov x9, #24 +; VBITS_GE_384-NEXT: mov x10, #8 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -300,6 +652,12 @@ ; Don't use SVE for 128-bit vectors. define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v2i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -310,6 +668,12 @@ } define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v4i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -324,6 +688,12 @@ } define void @extract_subvector_v8i64(<8 x i64>* %a, <4 x i64>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v8i64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -332,6 +702,14 @@ ; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: extract_subvector_v8i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: extract_subvector_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -347,6 +725,38 @@ } define void @extract_subvector_v16i64(<16 x i64>* %a, <8 x i64>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] 
+; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v16i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v16i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: mov x9, #12 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -362,6 +772,58 @@ } define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v32i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #20 +; VBITS_EQ_256-NEXT: mov x10, #28 +; VBITS_EQ_256-NEXT: mov x11, #24 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v32i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: mov x9, #20 +; VBITS_GE_384-NEXT: mov x10, #28 +; VBITS_GE_384-NEXT: mov x11, #24 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: mov x9, #12 +; VBITS_GE_384-NEXT: mov x10, #4 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -380,6 +842,12 @@ ; Don't use SVE for 64-bit vectors. 
define <2 x half> @extract_subvector_v4f16(<4 x half> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: dup v0.2s, v0.s[1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -391,6 +859,12 @@ ; Don't use SVE for 128-bit vectors. define <4 x half> @extract_subvector_v8f16(<8 x half> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -401,6 +875,12 @@ } define void @extract_subvector_v16f16(<16 x half>* %a, <8 x half>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -415,6 +895,12 @@ } define void @extract_subvector_v32f16(<32 x half>* %a, <16 x half>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v32f16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #16 @@ -423,6 +909,14 @@ ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: extract_subvector_v32f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: extract_subvector_v32f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -438,6 +932,38 @@ } define void @extract_subvector_v64f16(<64 x half>* %a, <32 x half>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v64f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v64f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #32 +; VBITS_GE_384-NEXT: mov x9, #48 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -453,6 +979,58 @@ } define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v128f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; 
NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v128f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #64 +; VBITS_EQ_256-NEXT: mov x9, #80 +; VBITS_EQ_256-NEXT: mov x10, #112 +; VBITS_EQ_256-NEXT: mov x11, #96 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v128f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #64 +; VBITS_GE_384-NEXT: mov x9, #80 +; VBITS_GE_384-NEXT: mov x10, #112 +; VBITS_GE_384-NEXT: mov x11, #96 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_384-NEXT: mov x8, #32 +; VBITS_GE_384-NEXT: mov x9, #48 +; VBITS_GE_384-NEXT: mov x10, #16 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_384-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -471,6 +1049,12 @@ ; Don't use SVE for 64-bit vectors. define <1 x float> @extract_subvector_v2f32(<2 x float> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: dup v0.2s, v0.s[1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 @@ -482,6 +1066,12 @@ ; Don't use SVE for 128-bit vectors. 
define <2 x float> @extract_subvector_v4f32(<4 x float> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -492,6 +1082,12 @@ } define void @extract_subvector_v8f32(<8 x float>* %a, <4 x float>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -506,6 +1102,12 @@ } define void @extract_subvector_v16f32(<16 x float>* %a, <8 x float>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v16f32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -514,6 +1116,14 @@ ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: extract_subvector_v16f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: extract_subvector_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -529,6 +1139,38 @@ } define void @extract_subvector_v32f32(<32 x float>* %a, <16 x float>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v32f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: mov x9, #24 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -544,6 +1186,58 @@ } define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v64f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #32 +; 
VBITS_EQ_256-NEXT: mov x9, #40 +; VBITS_EQ_256-NEXT: mov x10, #56 +; VBITS_EQ_256-NEXT: mov x11, #48 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v64f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #32 +; VBITS_GE_384-NEXT: mov x9, #40 +; VBITS_GE_384-NEXT: mov x10, #56 +; VBITS_GE_384-NEXT: mov x11, #48 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: mov x9, #24 +; VBITS_GE_384-NEXT: mov x10, #8 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -562,6 +1256,12 @@ ; Don't use SVE for 128-bit vectors. define <1 x double> @extract_subvector_v2f64(<2 x double> %op) #0 { +; NO_SVE-LABEL: extract_subvector_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 @@ -572,6 +1272,12 @@ } define void @extract_subvector_v4f64(<4 x double>* %a, <2 x double>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: extract_subvector_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -586,6 +1292,12 @@ } define void @extract_subvector_v8f64(<8 x double>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: extract_subvector_v8f64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -594,6 +1306,14 @@ ; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: extract_subvector_v8f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: extract_subvector_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -609,6 +1329,38 @@ } define void @extract_subvector_v16f64(<16 x double>* %a, <8 x double>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; 
NO_SVE-NEXT: stp q0, q1, [x1, #32] +; NO_SVE-NEXT: stp q2, q3, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v16f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: mov x9, #12 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -624,6 +1376,58 @@ } define void @extract_subvector_v32f64(<32 x double>* %a, <16 x double>* %b) #0 { +; NO_SVE-LABEL: extract_subvector_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #128] +; NO_SVE-NEXT: ldp q3, q2, [x0, #224] +; NO_SVE-NEXT: ldp q5, q4, [x0, #192] +; NO_SVE-NEXT: ldp q7, q6, [x0, #160] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: stp q5, q4, [x1, #64] +; NO_SVE-NEXT: stp q3, q2, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: extract_subvector_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #20 +; VBITS_EQ_256-NEXT: mov x10, #28 +; VBITS_EQ_256-NEXT: mov x11, #24 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: extract_subvector_v32f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: mov x9, #20 +; VBITS_GE_384-NEXT: mov x10, #28 +; VBITS_GE_384-NEXT: mov x11, #24 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: mov x9, #12 +; VBITS_GE_384-NEXT: mov x10, #4 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll @@ -1,6 +1,7 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_384 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -26,6 +27,11 @@ ; Don't use SVE for 64-bit vectors. define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) #0 { +; NO_SVE-LABEL: fcmp_oeq_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fcmeq v0.4h, v0.4h, v1.4h +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmeq v0.4h, v0.4h, v1.4h @@ -37,6 +43,11 @@ ; Don't use SVE for 128-bit vectors. define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) #0 { +; NO_SVE-LABEL: fcmp_oeq_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmeq v0.8h, v0.8h, v1.8h @@ -47,6 +58,15 @@ } define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmeq v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -66,6 +86,20 @@ define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #0 { ; Ensure sensible type legalisation +; NO_SVE-LABEL: fcmp_oeq_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q5, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: fcmeq v2.8h, v2.8h, v5.8h +; NO_SVE-NEXT: ldp q6, q1, [x1] +; NO_SVE-NEXT: stp q0, q2, [x2, #32] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, v6.8h +; NO_SVE-NEXT: fcmeq v1.8h, v4.8h, v1.8h +; NO_SVE-NEXT: stp q3, q1, [x2] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: fcmp_oeq_v32f16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #16 @@ -82,6 +116,22 @@ ; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x2] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: fcmp_oeq_v32f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_384-NEXT: fcmeq p1.h, p0/z, z0.h, z2.h +; VBITS_GE_384-NEXT: fcmeq p2.h, p0/z, z1.h, z3.h +; VBITS_GE_384-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, 
p0, [x2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: fcmp_oeq_v32f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -100,6 +150,86 @@ } define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q17, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0, #96] +; NO_SVE-NEXT: ldp q5, q6, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v5.8h, v0.8h +; NO_SVE-NEXT: ldp q7, q16, [x1, #96] +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, v7.8h +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v2.8h, v2.8h, v16.8h +; NO_SVE-NEXT: ldp q5, q7, [x1, #64] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, v5.8h +; NO_SVE-NEXT: ldp q16, q5, [x0] +; NO_SVE-NEXT: fcmeq v4.8h, v4.8h, v7.8h +; NO_SVE-NEXT: ldp q7, q18, [x1] +; NO_SVE-NEXT: stp q3, q4, [x2, #64] +; NO_SVE-NEXT: stp q1, q2, [x2, #96] +; NO_SVE-NEXT: fcmeq v1.8h, v6.8h, v17.8h +; NO_SVE-NEXT: fcmeq v2.8h, v16.8h, v7.8h +; NO_SVE-NEXT: fcmeq v3.8h, v5.8h, v18.8h +; NO_SVE-NEXT: stp q0, q1, [x2, #32] +; NO_SVE-NEXT: stp q2, q3, [x2] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcmp_oeq_v64f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z7.h }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z2.h, z5.h +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z1.h, z4.h +; VBITS_EQ_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z0.h, z6.h +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z3.h, z7.h +; VBITS_EQ_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: fcmp_oeq_v64f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #32 +; VBITS_GE_384-NEXT: mov x9, #48 +; VBITS_GE_384-NEXT: mov x10, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z7.h }, p0/z, [x1] +; VBITS_GE_384-NEXT: fcmeq p1.h, p0/z, z2.h, z5.h +; VBITS_GE_384-NEXT: fcmeq p2.h, p0/z, z1.h, z4.h +; VBITS_GE_384-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: fcmeq p1.h, p0/z, z0.h, z6.h +; VBITS_GE_384-NEXT: fcmeq p2.h, p0/z, z3.h, z7.h +; VBITS_GE_384-NEXT: mov z0.h, p1/z, #-1 // 
=0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1] +; VBITS_GE_384-NEXT: st1h { z3.h }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcmp_oeq_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -118,6 +248,162 @@ } define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v128f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill +; NO_SVE-NEXT: .cfi_def_cfa_offset 32 +; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; NO_SVE-NEXT: .cfi_offset b8, -8 +; NO_SVE-NEXT: .cfi_offset b9, -16 +; NO_SVE-NEXT: .cfi_offset b10, -32 +; NO_SVE-NEXT: ldp q25, q0, [x1, #224] +; NO_SVE-NEXT: ldp q2, q1, [x0, #224] +; NO_SVE-NEXT: fcmeq v2.8h, v2.8h, v25.8h +; NO_SVE-NEXT: ldp q6, q5, [x0, #160] +; NO_SVE-NEXT: fcmeq v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q29, q28, [x1, #160] +; NO_SVE-NEXT: ldp q4, q3, [x0, #192] +; NO_SVE-NEXT: fcmeq v5.8h, v5.8h, v28.8h +; NO_SVE-NEXT: ldp q27, q26, [x1, #192] +; NO_SVE-NEXT: fcmeq v4.8h, v4.8h, v27.8h +; NO_SVE-NEXT: ldp q16, q7, [x0, #128] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, v26.8h +; NO_SVE-NEXT: ldp q18, q17, [x0, #96] +; NO_SVE-NEXT: ldp q20, q19, [x0, #64] +; NO_SVE-NEXT: ldp q31, q30, [x1, #128] +; NO_SVE-NEXT: ldp q9, q8, [x1, #96] +; NO_SVE-NEXT: ldp q1, q28, [x1, #64] +; NO_SVE-NEXT: fcmeq v1.8h, v20.8h, v1.8h +; NO_SVE-NEXT: ldp q22, q21, [x0, #32] +; NO_SVE-NEXT: ldp q24, q23, [x0] +; NO_SVE-NEXT: ldp q26, q25, [x1, #32] +; NO_SVE-NEXT: ldp q10, q27, [x1] +; NO_SVE-NEXT: stp q4, q3, [x2, #192] +; NO_SVE-NEXT: stp q2, q0, [x2, #224] +; NO_SVE-NEXT: fcmeq v0.8h, v6.8h, v29.8h +; NO_SVE-NEXT: fcmeq v2.8h, v7.8h, v30.8h +; NO_SVE-NEXT: fcmeq v3.8h, v16.8h, v31.8h +; NO_SVE-NEXT: fcmeq v4.8h, v17.8h, v8.8h +; NO_SVE-NEXT: stp q0, q5, [x2, #160] +; NO_SVE-NEXT: fcmeq v5.8h, v18.8h, v9.8h +; NO_SVE-NEXT: fcmeq v0.8h, v19.8h, v28.8h +; NO_SVE-NEXT: stp q3, q2, [x2, #128] +; NO_SVE-NEXT: fcmeq v2.8h, v21.8h, v25.8h +; NO_SVE-NEXT: fcmeq v3.8h, v22.8h, v26.8h +; NO_SVE-NEXT: stp q5, q4, [x2, #96] +; NO_SVE-NEXT: fcmeq v4.8h, v23.8h, v27.8h +; NO_SVE-NEXT: stp q1, q0, [x2, #64] +; NO_SVE-NEXT: fcmeq v0.8h, v24.8h, v10.8h +; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; NO_SVE-NEXT: stp q3, q2, [x2, #32] +; NO_SVE-NEXT: stp q0, q4, [x2] +; NO_SVE-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcmp_oeq_v128f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #96 +; VBITS_EQ_256-NEXT: mov x9, #112 +; VBITS_EQ_256-NEXT: mov x10, #64 +; VBITS_EQ_256-NEXT: mov x11, #80 +; VBITS_EQ_256-NEXT: mov x12, #32 +; VBITS_EQ_256-NEXT: mov x13, #48 +; VBITS_EQ_256-NEXT: mov x14, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1] +; VBITS_EQ_256-NEXT: 
ld1h { z17.h }, p0/z, [x1, x14, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z20.h }, p0/z, [x1, x11, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z21.h }, p0/z, [x1, x12, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z19.h }, p0/z, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z22.h }, p0/z, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z18.h }, p0/z, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z23.h }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z6.h, z17.h +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z5.h, z16.h +; VBITS_EQ_256-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z6.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z4.h, z21.h +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z3.h, z20.h +; VBITS_EQ_256-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z4.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z2.h, z22.h +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z1.h, z19.h +; VBITS_EQ_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z0.h, z18.h +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z7.h, z23.h +; VBITS_EQ_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z7.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x2, x11, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x2, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z6.h }, p0, [x2, x13, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z5.h }, p0, [x2, x14, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z7.h }, p0, [x2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: fcmp_oeq_v128f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #96 +; VBITS_GE_384-NEXT: mov x9, #112 +; VBITS_GE_384-NEXT: mov x10, #64 +; VBITS_GE_384-NEXT: mov x11, #80 +; VBITS_GE_384-NEXT: mov x12, #32 +; VBITS_GE_384-NEXT: mov x13, #48 +; VBITS_GE_384-NEXT: mov x14, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z20.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z21.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z19.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z22.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z18.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z23.h }, p0/z, [x1] +; VBITS_GE_384-NEXT: fcmeq p1.h, p0/z, z6.h, z17.h +; VBITS_GE_384-NEXT: fcmeq p2.h, p0/z, z5.h, z16.h +; VBITS_GE_384-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z6.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: fcmeq p1.h, p0/z, z4.h, z21.h +; VBITS_GE_384-NEXT: fcmeq p2.h, p0/z, z3.h, z20.h +; VBITS_GE_384-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov 
z4.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: fcmeq p1.h, p0/z, z2.h, z22.h +; VBITS_GE_384-NEXT: fcmeq p2.h, p0/z, z1.h, z19.h +; VBITS_GE_384-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: fcmeq p1.h, p0/z, z0.h, z18.h +; VBITS_GE_384-NEXT: fcmeq p2.h, p0/z, z7.h, z23.h +; VBITS_GE_384-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z7.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1] +; VBITS_GE_384-NEXT: st1h { z4.h }, p0, [x2, x11, lsl #1] +; VBITS_GE_384-NEXT: st1h { z3.h }, p0, [x2, x12, lsl #1] +; VBITS_GE_384-NEXT: st1h { z6.h }, p0, [x2, x13, lsl #1] +; VBITS_GE_384-NEXT: st1h { z5.h }, p0, [x2, x14, lsl #1] +; VBITS_GE_384-NEXT: st1h { z7.h }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcmp_oeq_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -137,6 +423,11 @@ ; Don't use SVE for 64-bit vectors. define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) #0 { +; NO_SVE-LABEL: fcmp_oeq_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fcmeq v0.2s, v0.2s, v1.2s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s @@ -148,6 +439,11 @@ ; Don't use SVE for 128-bit vectors. define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) #0 { +; NO_SVE-LABEL: fcmp_oeq_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s @@ -158,6 +454,15 @@ } define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v0.4s, v1.4s, v0.4s +; NO_SVE-NEXT: fcmeq v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -177,6 +482,20 @@ define void @fcmp_oeq_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i32>* %c) #0 { ; Ensure sensible type legalisation +; NO_SVE-LABEL: fcmp_oeq_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q5, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.4s, v1.4s, v0.4s +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v5.4s +; NO_SVE-NEXT: ldp q6, q1, [x1] +; NO_SVE-NEXT: stp q0, q2, [x2, #32] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, v6.4s +; NO_SVE-NEXT: fcmeq v1.4s, v4.4s, v1.4s +; NO_SVE-NEXT: stp q3, q1, [x2] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: fcmp_oeq_v16f32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -193,6 +512,22 @@ ; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x2] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: fcmp_oeq_v16f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_384-NEXT: fcmeq p2.s, p0/z, z1.s, z3.s +; VBITS_GE_384-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; 
VBITS_GE_384-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: fcmp_oeq_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -211,6 +546,86 @@ } define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q17, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0, #96] +; NO_SVE-NEXT: ldp q5, q6, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.4s, v5.4s, v0.4s +; NO_SVE-NEXT: ldp q7, q16, [x1, #96] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v7.4s +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v16.4s +; NO_SVE-NEXT: ldp q5, q7, [x1, #64] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, v5.4s +; NO_SVE-NEXT: ldp q16, q5, [x0] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, v7.4s +; NO_SVE-NEXT: ldp q7, q18, [x1] +; NO_SVE-NEXT: stp q3, q4, [x2, #64] +; NO_SVE-NEXT: stp q1, q2, [x2, #96] +; NO_SVE-NEXT: fcmeq v1.4s, v6.4s, v17.4s +; NO_SVE-NEXT: fcmeq v2.4s, v16.4s, v7.4s +; NO_SVE-NEXT: fcmeq v3.4s, v5.4s, v18.4s +; NO_SVE-NEXT: stp q0, q1, [x2, #32] +; NO_SVE-NEXT: stp q2, q3, [x2] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcmp_oeq_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s +; VBITS_EQ_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z0.s, z6.s +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, z7.s +; VBITS_EQ_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: fcmp_oeq_v32f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: mov x9, #24 +; VBITS_GE_384-NEXT: mov x10, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s +; VBITS_GE_384-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s +; VBITS_GE_384-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff +; 
VBITS_GE_384-NEXT: fcmeq p1.s, p0/z, z0.s, z6.s +; VBITS_GE_384-NEXT: fcmeq p2.s, p0/z, z3.s, z7.s +; VBITS_GE_384-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcmp_oeq_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -229,6 +644,162 @@ } define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill +; NO_SVE-NEXT: .cfi_def_cfa_offset 32 +; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; NO_SVE-NEXT: .cfi_offset b8, -8 +; NO_SVE-NEXT: .cfi_offset b9, -16 +; NO_SVE-NEXT: .cfi_offset b10, -32 +; NO_SVE-NEXT: ldp q25, q0, [x1, #224] +; NO_SVE-NEXT: ldp q2, q1, [x0, #224] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v25.4s +; NO_SVE-NEXT: ldp q6, q5, [x0, #160] +; NO_SVE-NEXT: fcmeq v0.4s, v1.4s, v0.4s +; NO_SVE-NEXT: ldp q29, q28, [x1, #160] +; NO_SVE-NEXT: ldp q4, q3, [x0, #192] +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, v28.4s +; NO_SVE-NEXT: ldp q27, q26, [x1, #192] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, v27.4s +; NO_SVE-NEXT: ldp q16, q7, [x0, #128] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, v26.4s +; NO_SVE-NEXT: ldp q18, q17, [x0, #96] +; NO_SVE-NEXT: ldp q20, q19, [x0, #64] +; NO_SVE-NEXT: ldp q31, q30, [x1, #128] +; NO_SVE-NEXT: ldp q9, q8, [x1, #96] +; NO_SVE-NEXT: ldp q1, q28, [x1, #64] +; NO_SVE-NEXT: fcmeq v1.4s, v20.4s, v1.4s +; NO_SVE-NEXT: ldp q22, q21, [x0, #32] +; NO_SVE-NEXT: ldp q24, q23, [x0] +; NO_SVE-NEXT: ldp q26, q25, [x1, #32] +; NO_SVE-NEXT: ldp q10, q27, [x1] +; NO_SVE-NEXT: stp q4, q3, [x2, #192] +; NO_SVE-NEXT: stp q2, q0, [x2, #224] +; NO_SVE-NEXT: fcmeq v0.4s, v6.4s, v29.4s +; NO_SVE-NEXT: fcmeq v2.4s, v7.4s, v30.4s +; NO_SVE-NEXT: fcmeq v3.4s, v16.4s, v31.4s +; NO_SVE-NEXT: fcmeq v4.4s, v17.4s, v8.4s +; NO_SVE-NEXT: stp q0, q5, [x2, #160] +; NO_SVE-NEXT: fcmeq v5.4s, v18.4s, v9.4s +; NO_SVE-NEXT: fcmeq v0.4s, v19.4s, v28.4s +; NO_SVE-NEXT: stp q3, q2, [x2, #128] +; NO_SVE-NEXT: fcmeq v2.4s, v21.4s, v25.4s +; NO_SVE-NEXT: fcmeq v3.4s, v22.4s, v26.4s +; NO_SVE-NEXT: stp q5, q4, [x2, #96] +; NO_SVE-NEXT: fcmeq v4.4s, v23.4s, v27.4s +; NO_SVE-NEXT: stp q1, q0, [x2, #64] +; NO_SVE-NEXT: fcmeq v0.4s, v24.4s, v10.4s +; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; NO_SVE-NEXT: stp q3, q2, [x2, #32] +; NO_SVE-NEXT: stp q0, q4, [x2] +; NO_SVE-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcmp_oeq_v64f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #48 +; VBITS_EQ_256-NEXT: mov x9, #56 +; VBITS_EQ_256-NEXT: mov x10, #32 +; VBITS_EQ_256-NEXT: mov x11, #40 +; VBITS_EQ_256-NEXT: mov x12, #16 +; VBITS_EQ_256-NEXT: mov x13, #24 +; VBITS_EQ_256-NEXT: mov x14, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl 
#2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z20.s }, p0/z, [x1, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z21.s }, p0/z, [x1, x12, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z19.s }, p0/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z22.s }, p0/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z18.s }, p0/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s +; VBITS_EQ_256-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z6.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z4.s, z21.s +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, z20.s +; VBITS_EQ_256-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z4.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z2.s, z22.s +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z1.s, z19.s +; VBITS_EQ_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z0.s, z18.s +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z7.s, z23.s +; VBITS_EQ_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z7.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x2, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x2, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z6.s }, p0, [x2, x13, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x2, x14, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z7.s }, p0, [x2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: fcmp_oeq_v64f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #48 +; VBITS_GE_384-NEXT: mov x9, #56 +; VBITS_GE_384-NEXT: mov x10, #32 +; VBITS_GE_384-NEXT: mov x11, #40 +; VBITS_GE_384-NEXT: mov x12, #16 +; VBITS_GE_384-NEXT: mov x13, #24 +; VBITS_GE_384-NEXT: mov x14, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z20.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z21.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z19.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z22.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z18.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s +; VBITS_GE_384-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s +; VBITS_GE_384-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z6.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: fcmeq p1.s, p0/z, z4.s, z21.s +; 
VBITS_GE_384-NEXT: fcmeq p2.s, p0/z, z3.s, z20.s +; VBITS_GE_384-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z4.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: fcmeq p1.s, p0/z, z2.s, z22.s +; VBITS_GE_384-NEXT: fcmeq p2.s, p0/z, z1.s, z19.s +; VBITS_GE_384-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: fcmeq p1.s, p0/z, z0.s, z18.s +; VBITS_GE_384-NEXT: fcmeq p2.s, p0/z, z7.s, z23.s +; VBITS_GE_384-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z7.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z4.s }, p0, [x2, x11, lsl #2] +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x2, x12, lsl #2] +; VBITS_GE_384-NEXT: st1w { z6.s }, p0, [x2, x13, lsl #2] +; VBITS_GE_384-NEXT: st1w { z5.s }, p0, [x2, x14, lsl #2] +; VBITS_GE_384-NEXT: st1w { z7.s }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcmp_oeq_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -248,6 +819,11 @@ ; Don't use SVE for 64-bit vectors. define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) #0 { +; NO_SVE-LABEL: fcmp_oeq_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fcmeq d0, d0, d1 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmeq d0, d0, d1 @@ -259,6 +835,11 @@ ; Don't use SVE for 128-bit vectors. define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) #0 { +; NO_SVE-LABEL: fcmp_oeq_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, v1.2d +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmeq v0.2d, v0.2d, v1.2d @@ -269,6 +850,15 @@ } define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v0.2d, v1.2d, v0.2d +; NO_SVE-NEXT: fcmeq v1.2d, v2.2d, v3.2d +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oeq_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -288,6 +878,20 @@ define void @fcmp_oeq_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i64>* %c) #0 { ; Ensure sensible type legalisation +; NO_SVE-LABEL: fcmp_oeq_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q5, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.2d, v1.2d, v0.2d +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, v5.2d +; NO_SVE-NEXT: ldp q6, q1, [x1] +; NO_SVE-NEXT: stp q0, q2, [x2, #32] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, v6.2d +; NO_SVE-NEXT: fcmeq v1.2d, v4.2d, v1.2d +; NO_SVE-NEXT: stp q3, q1, [x2] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: fcmp_oeq_v8f64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -304,6 +908,22 @@ ; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x2] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: fcmp_oeq_v8f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_384-NEXT: fcmeq p1.d, p0/z, z0.d, z2.d +; 
VBITS_GE_384-NEXT: fcmeq p2.d, p0/z, z1.d, z3.d +; VBITS_GE_384-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: fcmp_oeq_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -322,6 +942,86 @@ } define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q17, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0, #96] +; NO_SVE-NEXT: ldp q5, q6, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.2d, v5.2d, v0.2d +; NO_SVE-NEXT: ldp q7, q16, [x1, #96] +; NO_SVE-NEXT: fcmeq v1.2d, v1.2d, v7.2d +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, v16.2d +; NO_SVE-NEXT: ldp q5, q7, [x1, #64] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, v5.2d +; NO_SVE-NEXT: ldp q16, q5, [x0] +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, v7.2d +; NO_SVE-NEXT: ldp q7, q18, [x1] +; NO_SVE-NEXT: stp q3, q4, [x2, #64] +; NO_SVE-NEXT: stp q1, q2, [x2, #96] +; NO_SVE-NEXT: fcmeq v1.2d, v6.2d, v17.2d +; NO_SVE-NEXT: fcmeq v2.2d, v16.2d, v7.2d +; NO_SVE-NEXT: fcmeq v3.2d, v5.2d, v18.2d +; NO_SVE-NEXT: stp q0, q1, [x2, #32] +; NO_SVE-NEXT: stp q2, q3, [x2] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcmp_oeq_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z2.d, z5.d +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z1.d, z4.d +; VBITS_EQ_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z0.d, z6.d +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z3.d, z7.d +; VBITS_EQ_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: fcmp_oeq_v16f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: mov x9, #12 +; VBITS_GE_384-NEXT: mov x10, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_384-NEXT: fcmeq p1.d, p0/z, z2.d, z5.d +; VBITS_GE_384-NEXT: fcmeq p2.d, p0/z, z1.d, z4.d +; VBITS_GE_384-NEXT: 
mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: fcmeq p1.d, p0/z, z0.d, z6.d +; VBITS_GE_384-NEXT: fcmeq p2.d, p0/z, z3.d, z7.d +; VBITS_GE_384-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z3.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcmp_oeq_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -340,6 +1040,162 @@ } define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %c) #0 { +; NO_SVE-LABEL: fcmp_oeq_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill +; NO_SVE-NEXT: .cfi_def_cfa_offset 32 +; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; NO_SVE-NEXT: .cfi_offset b8, -8 +; NO_SVE-NEXT: .cfi_offset b9, -16 +; NO_SVE-NEXT: .cfi_offset b10, -32 +; NO_SVE-NEXT: ldp q25, q0, [x1, #224] +; NO_SVE-NEXT: ldp q2, q1, [x0, #224] +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, v25.2d +; NO_SVE-NEXT: ldp q6, q5, [x0, #160] +; NO_SVE-NEXT: fcmeq v0.2d, v1.2d, v0.2d +; NO_SVE-NEXT: ldp q29, q28, [x1, #160] +; NO_SVE-NEXT: ldp q4, q3, [x0, #192] +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, v28.2d +; NO_SVE-NEXT: ldp q27, q26, [x1, #192] +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, v27.2d +; NO_SVE-NEXT: ldp q16, q7, [x0, #128] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, v26.2d +; NO_SVE-NEXT: ldp q18, q17, [x0, #96] +; NO_SVE-NEXT: ldp q20, q19, [x0, #64] +; NO_SVE-NEXT: ldp q31, q30, [x1, #128] +; NO_SVE-NEXT: ldp q9, q8, [x1, #96] +; NO_SVE-NEXT: ldp q1, q28, [x1, #64] +; NO_SVE-NEXT: fcmeq v1.2d, v20.2d, v1.2d +; NO_SVE-NEXT: ldp q22, q21, [x0, #32] +; NO_SVE-NEXT: ldp q24, q23, [x0] +; NO_SVE-NEXT: ldp q26, q25, [x1, #32] +; NO_SVE-NEXT: ldp q10, q27, [x1] +; NO_SVE-NEXT: stp q4, q3, [x2, #192] +; NO_SVE-NEXT: stp q2, q0, [x2, #224] +; NO_SVE-NEXT: fcmeq v0.2d, v6.2d, v29.2d +; NO_SVE-NEXT: fcmeq v2.2d, v7.2d, v30.2d +; NO_SVE-NEXT: fcmeq v3.2d, v16.2d, v31.2d +; NO_SVE-NEXT: fcmeq v4.2d, v17.2d, v8.2d +; NO_SVE-NEXT: stp q0, q5, [x2, #160] +; NO_SVE-NEXT: fcmeq v5.2d, v18.2d, v9.2d +; NO_SVE-NEXT: fcmeq v0.2d, v19.2d, v28.2d +; NO_SVE-NEXT: stp q3, q2, [x2, #128] +; NO_SVE-NEXT: fcmeq v2.2d, v21.2d, v25.2d +; NO_SVE-NEXT: fcmeq v3.2d, v22.2d, v26.2d +; NO_SVE-NEXT: stp q5, q4, [x2, #96] +; NO_SVE-NEXT: fcmeq v4.2d, v23.2d, v27.2d +; NO_SVE-NEXT: stp q1, q0, [x2, #64] +; NO_SVE-NEXT: fcmeq v0.2d, v24.2d, v10.2d +; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; NO_SVE-NEXT: stp q3, q2, [x2, #32] +; NO_SVE-NEXT: stp q0, q4, [x2] +; NO_SVE-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: fcmp_oeq_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: mov x12, #8 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov x14, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; 
VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z21.d }, p0/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z19.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z6.d, z17.d +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z5.d, z16.d +; VBITS_EQ_256-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z6.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z4.d, z21.d +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z3.d, z20.d +; VBITS_EQ_256-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z4.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z2.d, z22.d +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z1.d, z19.d +; VBITS_EQ_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z0.d, z18.d +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z7.d, z23.d +; VBITS_EQ_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z7.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x2, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x2, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x2, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x2, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: fcmp_oeq_v32f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #24 +; VBITS_GE_384-NEXT: mov x9, #28 +; VBITS_GE_384-NEXT: mov x10, #16 +; VBITS_GE_384-NEXT: mov x11, #20 +; VBITS_GE_384-NEXT: mov x12, #8 +; VBITS_GE_384-NEXT: mov x13, #12 +; VBITS_GE_384-NEXT: mov x14, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z21.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z19.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z18.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_GE_384-NEXT: fcmeq p1.d, p0/z, z6.d, z17.d +; VBITS_GE_384-NEXT: fcmeq p2.d, p0/z, z5.d, z16.d +; VBITS_GE_384-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; 
VBITS_GE_384-NEXT: mov z6.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: fcmeq p1.d, p0/z, z4.d, z21.d +; VBITS_GE_384-NEXT: fcmeq p2.d, p0/z, z3.d, z20.d +; VBITS_GE_384-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z4.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: fcmeq p1.d, p0/z, z2.d, z22.d +; VBITS_GE_384-NEXT: fcmeq p2.d, p0/z, z1.d, z19.d +; VBITS_GE_384-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: fcmeq p1.d, p0/z, z0.d, z18.d +; VBITS_GE_384-NEXT: fcmeq p2.d, p0/z, z7.d, z23.d +; VBITS_GE_384-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z7.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z4.d }, p0, [x2, x11, lsl #3] +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x2, x12, lsl #3] +; VBITS_GE_384-NEXT: st1d { z6.d }, p0, [x2, x13, lsl #3] +; VBITS_GE_384-NEXT: st1d { z5.d }, p0, [x2, x14, lsl #3] +; VBITS_GE_384-NEXT: st1d { z7.d }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcmp_oeq_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -362,6 +1218,21 @@ ; define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ueq_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q3, q0, [x1] +; NO_SVE-NEXT: ldp q2, q1, [x0] +; NO_SVE-NEXT: fcmgt v4.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmgt v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmgt v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: fcmgt v2.8h, v3.8h, v2.8h +; NO_SVE-NEXT: orr v0.16b, v0.16b, v4.16b +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: orr v1.16b, v2.16b, v1.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q1, q0, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ueq_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -386,6 +1257,19 @@ ; define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_one_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q3, q0, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmgt v5.8h, v1.8h, v3.8h +; NO_SVE-NEXT: fcmgt v1.8h, v3.8h, v1.8h +; NO_SVE-NEXT: fcmgt v4.8h, v2.8h, v0.8h +; NO_SVE-NEXT: fcmgt v0.8h, v0.8h, v2.8h +; NO_SVE-NEXT: orr v1.16b, v1.16b, v5.16b +; NO_SVE-NEXT: orr v0.16b, v0.16b, v4.16b +; NO_SVE-NEXT: stp q1, q0, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_one_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -410,6 +1294,17 @@ ; define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_une_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmeq v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_une_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -432,6 +1327,15 @@ ; define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ogt_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmgt v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmgt v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret 
+; ; CHECK-LABEL: fcmp_ogt_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -454,6 +1358,17 @@ ; define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ugt_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmge v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmge v1.8h, v3.8h, v2.8h +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ugt_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -478,6 +1393,15 @@ ; define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_olt_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmgt v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmgt v1.8h, v3.8h, v2.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_olt_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -500,6 +1424,17 @@ ; define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ult_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmge v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmge v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ult_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -524,6 +1459,15 @@ ; define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_oge_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmge v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmge v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_oge_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -546,6 +1490,17 @@ ; define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_uge_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmgt v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmgt v1.8h, v3.8h, v2.8h +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_uge_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -570,6 +1525,15 @@ ; define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ole_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmge v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmge v1.8h, v3.8h, v2.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ole_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -592,6 +1556,17 @@ ; define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ule_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmgt v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmgt v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ule_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -616,6 +1591,21 @@ ; define 
void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_uno_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q3, q0, [x1] +; NO_SVE-NEXT: ldp q2, q1, [x0] +; NO_SVE-NEXT: fcmge v4.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmgt v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmge v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: fcmgt v2.8h, v3.8h, v2.8h +; NO_SVE-NEXT: orr v0.16b, v0.16b, v4.16b +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: orr v1.16b, v2.16b, v1.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q1, q0, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_uno_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -638,6 +1628,19 @@ ; define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ord_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q3, q0, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmge v5.8h, v1.8h, v3.8h +; NO_SVE-NEXT: fcmgt v1.8h, v3.8h, v1.8h +; NO_SVE-NEXT: fcmge v4.8h, v2.8h, v0.8h +; NO_SVE-NEXT: fcmgt v0.8h, v0.8h, v2.8h +; NO_SVE-NEXT: orr v1.16b, v1.16b, v5.16b +; NO_SVE-NEXT: orr v0.16b, v0.16b, v4.16b +; NO_SVE-NEXT: stp q1, q0, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ord_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -662,6 +1665,15 @@ ; define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_eq_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmeq v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_eq_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -684,6 +1696,17 @@ ; define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ne_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmeq v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: mvn v0.16b, v0.16b +; NO_SVE-NEXT: mvn v1.16b, v1.16b +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ne_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -706,6 +1729,15 @@ ; define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_gt_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmgt v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmgt v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_gt_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -728,6 +1760,91 @@ ; define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_lt_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x1, #16] +; NO_SVE-NEXT: ldr q1, [x0, #16] +; NO_SVE-NEXT: mov h2, v0.h[1] +; NO_SVE-NEXT: mov h4, v0.h[3] +; NO_SVE-NEXT: mov h3, v1.h[1] +; NO_SVE-NEXT: mov h5, v1.h[3] +; NO_SVE-NEXT: mov h6, v1.h[6] +; NO_SVE-NEXT: fcmp h3, h2 +; NO_SVE-NEXT: mov h2, v0.h[2] +; NO_SVE-NEXT: mov h3, v1.h[2] +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: fcmp h1, h0 +; NO_SVE-NEXT: csetm w9, lt +; NO_SVE-NEXT: fcmp h3, h2 +; NO_SVE-NEXT: mov h2, v0.h[4] +; NO_SVE-NEXT: mov h3, v1.h[4] +; NO_SVE-NEXT: csetm w10, lt +; NO_SVE-NEXT: fcmp h5, h4 +; NO_SVE-NEXT: mov h4, v0.h[5] +; NO_SVE-NEXT: mov h5, v1.h[5] +; NO_SVE-NEXT: mov h1, v1.h[7] +; NO_SVE-NEXT: csetm w11, lt +; NO_SVE-NEXT: 
fcmp h3, h2 +; NO_SVE-NEXT: mov h3, v0.h[6] +; NO_SVE-NEXT: ldr q2, [x0] +; NO_SVE-NEXT: mov h0, v0.h[7] +; NO_SVE-NEXT: csetm w12, lt +; NO_SVE-NEXT: fcmp h5, h4 +; NO_SVE-NEXT: ldr q4, [x1] +; NO_SVE-NEXT: mov h5, v2.h[1] +; NO_SVE-NEXT: csetm w13, lt +; NO_SVE-NEXT: fcmp h6, h3 +; NO_SVE-NEXT: mov h3, v4.h[1] +; NO_SVE-NEXT: mov h6, v4.h[3] +; NO_SVE-NEXT: csetm w14, lt +; NO_SVE-NEXT: fcmp h1, h0 +; NO_SVE-NEXT: mov h0, v4.h[2] +; NO_SVE-NEXT: mov h1, v2.h[2] +; NO_SVE-NEXT: csetm w15, lt +; NO_SVE-NEXT: fcmp h5, h3 +; NO_SVE-NEXT: fmov s3, w9 +; NO_SVE-NEXT: csetm w16, lt +; NO_SVE-NEXT: fcmp h2, h4 +; NO_SVE-NEXT: mov v3.h[1], w8 +; NO_SVE-NEXT: csetm w17, lt +; NO_SVE-NEXT: fcmp h1, h0 +; NO_SVE-NEXT: mov h0, v2.h[3] +; NO_SVE-NEXT: mov h1, v4.h[4] +; NO_SVE-NEXT: mov v3.h[2], w10 +; NO_SVE-NEXT: fmov s5, w17 +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: fcmp h0, h6 +; NO_SVE-NEXT: mov h0, v2.h[4] +; NO_SVE-NEXT: mov v5.h[1], w16 +; NO_SVE-NEXT: mov h6, v4.h[5] +; NO_SVE-NEXT: mov v3.h[3], w11 +; NO_SVE-NEXT: mov v5.h[2], w8 +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: fcmp h0, h1 +; NO_SVE-NEXT: mov h0, v2.h[5] +; NO_SVE-NEXT: mov h1, v4.h[6] +; NO_SVE-NEXT: mov v3.h[4], w12 +; NO_SVE-NEXT: mov h4, v4.h[7] +; NO_SVE-NEXT: mov v5.h[3], w8 +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: fcmp h0, h6 +; NO_SVE-NEXT: mov h0, v2.h[6] +; NO_SVE-NEXT: mov h2, v2.h[7] +; NO_SVE-NEXT: mov v5.h[4], w8 +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: fcmp h0, h1 +; NO_SVE-NEXT: mov v3.h[5], w13 +; NO_SVE-NEXT: mov v5.h[5], w8 +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: fcmp h2, h4 +; NO_SVE-NEXT: mov v3.h[6], w14 +; NO_SVE-NEXT: mov v5.h[6], w8 +; NO_SVE-NEXT: csetm w8, lt +; NO_SVE-NEXT: mov v3.h[7], w15 +; NO_SVE-NEXT: mov v5.h[7], w8 +; NO_SVE-NEXT: stp q5, q3, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_lt_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -750,6 +1867,15 @@ ; define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_ge_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmge v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: fcmge v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: stp q0, q1, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_ge_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -772,6 +1898,91 @@ ; define void @fcmp_le_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; NO_SVE-LABEL: fcmp_le_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x1, #16] +; NO_SVE-NEXT: ldr q1, [x0, #16] +; NO_SVE-NEXT: mov h2, v0.h[1] +; NO_SVE-NEXT: mov h4, v0.h[3] +; NO_SVE-NEXT: mov h3, v1.h[1] +; NO_SVE-NEXT: mov h5, v1.h[3] +; NO_SVE-NEXT: mov h6, v1.h[6] +; NO_SVE-NEXT: fcmp h3, h2 +; NO_SVE-NEXT: mov h2, v0.h[2] +; NO_SVE-NEXT: mov h3, v1.h[2] +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: fcmp h1, h0 +; NO_SVE-NEXT: csetm w9, le +; NO_SVE-NEXT: fcmp h3, h2 +; NO_SVE-NEXT: mov h2, v0.h[4] +; NO_SVE-NEXT: mov h3, v1.h[4] +; NO_SVE-NEXT: csetm w10, le +; NO_SVE-NEXT: fcmp h5, h4 +; NO_SVE-NEXT: mov h4, v0.h[5] +; NO_SVE-NEXT: mov h5, v1.h[5] +; NO_SVE-NEXT: mov h1, v1.h[7] +; NO_SVE-NEXT: csetm w11, le +; NO_SVE-NEXT: fcmp h3, h2 +; NO_SVE-NEXT: mov h3, v0.h[6] +; NO_SVE-NEXT: ldr q2, [x0] +; NO_SVE-NEXT: mov h0, v0.h[7] +; NO_SVE-NEXT: csetm w12, le +; NO_SVE-NEXT: fcmp h5, h4 +; NO_SVE-NEXT: ldr q4, [x1] +; NO_SVE-NEXT: mov h5, v2.h[1] +; NO_SVE-NEXT: csetm w13, le +; NO_SVE-NEXT: fcmp h6, h3 +; NO_SVE-NEXT: mov h3, v4.h[1] +; NO_SVE-NEXT: mov h6, v4.h[3] +; 
NO_SVE-NEXT: csetm w14, le +; NO_SVE-NEXT: fcmp h1, h0 +; NO_SVE-NEXT: mov h0, v4.h[2] +; NO_SVE-NEXT: mov h1, v2.h[2] +; NO_SVE-NEXT: csetm w15, le +; NO_SVE-NEXT: fcmp h5, h3 +; NO_SVE-NEXT: fmov s3, w9 +; NO_SVE-NEXT: csetm w16, le +; NO_SVE-NEXT: fcmp h2, h4 +; NO_SVE-NEXT: mov v3.h[1], w8 +; NO_SVE-NEXT: csetm w17, le +; NO_SVE-NEXT: fcmp h1, h0 +; NO_SVE-NEXT: mov h0, v2.h[3] +; NO_SVE-NEXT: mov h1, v4.h[4] +; NO_SVE-NEXT: mov v3.h[2], w10 +; NO_SVE-NEXT: fmov s5, w17 +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: fcmp h0, h6 +; NO_SVE-NEXT: mov h0, v2.h[4] +; NO_SVE-NEXT: mov v5.h[1], w16 +; NO_SVE-NEXT: mov h6, v4.h[5] +; NO_SVE-NEXT: mov v3.h[3], w11 +; NO_SVE-NEXT: mov v5.h[2], w8 +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: fcmp h0, h1 +; NO_SVE-NEXT: mov h0, v2.h[5] +; NO_SVE-NEXT: mov h1, v4.h[6] +; NO_SVE-NEXT: mov v3.h[4], w12 +; NO_SVE-NEXT: mov h4, v4.h[7] +; NO_SVE-NEXT: mov v5.h[3], w8 +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: fcmp h0, h6 +; NO_SVE-NEXT: mov h0, v2.h[6] +; NO_SVE-NEXT: mov h2, v2.h[7] +; NO_SVE-NEXT: mov v5.h[4], w8 +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: fcmp h0, h1 +; NO_SVE-NEXT: mov v3.h[5], w13 +; NO_SVE-NEXT: mov v5.h[5], w8 +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: fcmp h2, h4 +; NO_SVE-NEXT: mov v3.h[6], w14 +; NO_SVE-NEXT: mov v5.h[6], w8 +; NO_SVE-NEXT: csetm w8, le +; NO_SVE-NEXT: mov v3.h[7], w15 +; NO_SVE-NEXT: mov v5.h[7], w8 +; NO_SVE-NEXT: stp q5, q3, [x2] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: fcmp_le_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -8,13 +9,66 @@ ; spill slot. define dso_local void @func1(i64* %v1, i64* %v2, i64* %v3, i64* %v4, i64* %v5, i64* %v6, i64* %v7, i64* %v8, +; CHECK-LABEL: func1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x25, [sp, #-64]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w24, -48 +; CHECK-NEXT: .cfi_offset w25, -64 +; CHECK-NEXT: add x8, sp, #64 +; CHECK-NEXT: add x9, sp, #128 +; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: add x11, sp, #192 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x20, sp, #192 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x10] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x11] +; CHECK-NEXT: ldp x18, x19, [sp, #368] +; CHECK-NEXT: add x21, sp, #160 +; CHECK-NEXT: add x22, sp, #128 +; CHECK-NEXT: ldp x24, x14, [sp, #296] +; CHECK-NEXT: add x23, sp, #64 +; CHECK-NEXT: ldr x25, [sp, #288] +; CHECK-NEXT: ldp x9, x8, [sp, #344] +; CHECK-NEXT: ldp x11, x10, [sp, #328] +; CHECK-NEXT: ldp x13, x12, [sp, #312] +; CHECK-NEXT: ldr x15, [sp, #120] +; CHECK-NEXT: ldur q4, [sp, #104] +; CHECK-NEXT: ldp x16, x17, [sp, #224] +; CHECK-NEXT: st1d { z3.d }, p0, [x20] +; CHECK-NEXT: st1d { z2.d }, p0, [x21] +; CHECK-NEXT: st1d { z1.d }, p0, [x22] +; CHECK-NEXT: st1d { z0.d }, p0, [x23] +; CHECK-NEXT: stp x18, x19, [sp, #368] +; CHECK-NEXT: stp x25, x24, [sp, #288] +; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: stp x16, x17, [sp, #224] +; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: stur q4, [sp, #104] +; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: str x15, [sp, #120] +; CHECK-NEXT: stp x14, x13, [sp, #304] +; CHECK-NEXT: stp x12, x11, [sp, #320] +; CHECK-NEXT: stp x10, x9, [sp, #336] +; CHECK-NEXT: str x8, [sp, #352] +; CHECK-NEXT: ldr x25, [sp], #64 // 8-byte Folded Reload +; CHECK-NEXT: b func2 i64* %v9, i64* %v10, i64* %v11, i64* %v12, i64* %v13, i64* %v14, i64* %v15, i64* %v16, i64* %v17, i64* %v18, i64* %v19, i64* %v20, i64* %v21, i64* %v22, i64* %v23, i64* %v24, i64* %v25, i64* %v26, i64* %v27, i64* %v28, i64* %v29, i64* %v30, i64* %v31, i64* %v32, i64* %v33, i64* %v34, i64* %v35, i64* %v36, i64* %v37, i64* %v38, i64* %v39, i64* %v40, i64* %v41, i64* %v42, i64* %v43, i64* %v44, i64* %v45, i64* %v46, i64* %v47, i64* %v48, i64 %v49) #0 { -; CHECK-LABEL: func1 tail call void @func2(i64* %v1, i64* %v2, i64* %v3, i64* %v4, i64* %v5, i64* %v6, i64* %v7, i64* %v8, i64* %v9, i64* %v10, i64* %v11, i64* %v12, i64* undef, i64* %v14, i64* %v15, i64* %v16, i64* %v17, i64* %v18, i64* %v19, i64* %v20, i64* %v21, i64* %v22, i64* %v23, i64* %v24, diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=VBITS_GE_256 @@ -26,6 +27,14 @@ ; Don't use SVE for 64-bit vectors. 
define <4 x half> @insertelement_v4f16(<4 x half> %op1) #0 { +; NO_SVE-LABEL: insertelement_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov h1, #5.00000000 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: mov v0.h[3], v1.h[0] +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v4f16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: fmov h1, #5.00000000 @@ -39,6 +48,12 @@ ; Don't use SVE for 128-bit vectors. define <8 x half> @insertelement_v8f16(<8 x half> %op1) #0 { +; NO_SVE-LABEL: insertelement_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov h1, #5.00000000 +; NO_SVE-NEXT: mov v0.h[7], v1.h[0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v8f16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: fmov h1, #5.00000000 @@ -49,6 +64,14 @@ } define <16 x half> @insertelement_v16f16(<16 x half>* %a) #0 { +; NO_SVE-LABEL: insertelement_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov h0, #5.00000000 +; NO_SVE-NEXT: ldr q1, [x0, #16] +; NO_SVE-NEXT: mov v1.h[7], v0.h[0] +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v16f16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov w9, #15 @@ -68,6 +91,15 @@ } define <32 x half> @insertelement_v32f16(<32 x half>* %a) #0 { +; NO_SVE-LABEL: insertelement_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov h2, #5.00000000 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: mov v3.h[7], v2.h[0] +; NO_SVE-NEXT: ldr q2, [x0, #32] +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: insertelement_v32f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: mov w9, #31 @@ -87,6 +119,16 @@ } define <64 x half> @insertelement_v64f16(<64 x half>* %a) #0 { +; NO_SVE-LABEL: insertelement_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q6, q7, [x0, #96] +; NO_SVE-NEXT: fmov h5, #5.00000000 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: mov v7.h[7], v5.h[0] +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: ldp q4, q5, [x0, #64] +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: insertelement_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: mov w9, #63 @@ -106,6 +148,28 @@ } define <128 x half> @insertelement_v128f16(<128 x half>* %a) #0 { +; NO_SVE-LABEL: insertelement_v128f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q23, q24, [x0, #224] +; NO_SVE-NEXT: fmov h0, #5.00000000 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: mov v24.h[7], v0.h[0] +; NO_SVE-NEXT: ldp q3, q4, [x0, #32] +; NO_SVE-NEXT: ldp q5, q6, [x0, #64] +; NO_SVE-NEXT: ldp q7, q16, [x0, #96] +; NO_SVE-NEXT: ldp q17, q18, [x0, #128] +; NO_SVE-NEXT: ldp q19, q20, [x0, #192] +; NO_SVE-NEXT: ldp q21, q22, [x0, #160] +; NO_SVE-NEXT: stp q1, q2, [x8] +; NO_SVE-NEXT: stp q3, q4, [x8, #32] +; NO_SVE-NEXT: stp q5, q6, [x8, #64] +; NO_SVE-NEXT: stp q7, q16, [x8, #96] +; NO_SVE-NEXT: stp q17, q18, [x8, #128] +; NO_SVE-NEXT: stp q21, q22, [x8, #160] +; NO_SVE-NEXT: stp q19, q20, [x8, #192] +; NO_SVE-NEXT: stp q23, q24, [x8, #224] +; NO_SVE-NEXT: ret +; ; VBITS_GE_2048-LABEL: insertelement_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: mov w9, #127 @@ -126,6 +190,14 @@ ; Don't use SVE for 64-bit vectors. 
define <2 x float> @insertelement_v2f32(<2 x float> %op1) #0 { +; NO_SVE-LABEL: insertelement_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov s1, #5.00000000 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: mov v0.s[1], v1.s[0] +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v2f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: fmov s1, #5.00000000 @@ -139,6 +211,12 @@ ; Don't use SVE for 128-bit vectors. define <4 x float> @insertelement_v4f32(<4 x float> %op1) #0 { +; NO_SVE-LABEL: insertelement_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov s1, #5.00000000 +; NO_SVE-NEXT: mov v0.s[3], v1.s[0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v4f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: fmov s1, #5.00000000 @@ -149,6 +227,13 @@ } define <8 x float> @insertelement_v8f32(<8 x float>* %a) #0 { +; NO_SVE-LABEL: insertelement_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: fmov s2, #5.00000000 +; NO_SVE-NEXT: mov v1.s[3], v2.s[0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v8f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov w9, #7 @@ -168,6 +253,14 @@ } define <16 x float> @insertelement_v16f32(<16 x float>* %a) #0 { +; NO_SVE-LABEL: insertelement_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: fmov s1, #5.00000000 +; NO_SVE-NEXT: mov v3.s[3], v1.s[0] +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: insertelement_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: mov w9, #15 @@ -187,6 +280,16 @@ } define <32 x float> @insertelement_v32f32(<32 x float>* %a) #0 { +; NO_SVE-LABEL: insertelement_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q6, q7, [x0, #96] +; NO_SVE-NEXT: fmov s5, #5.00000000 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: mov v7.s[3], v5.s[0] +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: ldp q4, q5, [x0, #64] +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: insertelement_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: mov w9, #31 @@ -206,6 +309,28 @@ } define <64 x float> @insertelement_v64f32(<64 x float>* %a) #0 { +; NO_SVE-LABEL: insertelement_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q23, q24, [x0, #224] +; NO_SVE-NEXT: fmov s0, #5.00000000 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: mov v24.s[3], v0.s[0] +; NO_SVE-NEXT: ldp q3, q4, [x0, #32] +; NO_SVE-NEXT: ldp q5, q6, [x0, #64] +; NO_SVE-NEXT: ldp q7, q16, [x0, #96] +; NO_SVE-NEXT: ldp q17, q18, [x0, #128] +; NO_SVE-NEXT: ldp q19, q20, [x0, #192] +; NO_SVE-NEXT: ldp q21, q22, [x0, #160] +; NO_SVE-NEXT: stp q1, q2, [x8] +; NO_SVE-NEXT: stp q3, q4, [x8, #32] +; NO_SVE-NEXT: stp q5, q6, [x8, #64] +; NO_SVE-NEXT: stp q7, q16, [x8, #96] +; NO_SVE-NEXT: stp q17, q18, [x8, #128] +; NO_SVE-NEXT: stp q21, q22, [x8, #160] +; NO_SVE-NEXT: stp q19, q20, [x8, #192] +; NO_SVE-NEXT: stp q23, q24, [x8, #224] +; NO_SVE-NEXT: ret +; ; VBITS_GE_2048-LABEL: insertelement_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: mov w9, #63 @@ -226,6 +351,11 @@ ; Don't use SVE for 64-bit vectors. define <1 x double> @insertelement_v1f64(<1 x double> %op1) #0 { +; NO_SVE-LABEL: insertelement_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov d0, #5.00000000 +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v1f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: fmov d0, #5.00000000 @@ -236,6 +366,12 @@ ; Don't use SVE for 128-bit vectors. 
define <2 x double> @insertelement_v2f64(<2 x double> %op1) #0 { +; NO_SVE-LABEL: insertelement_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov d1, #5.00000000 +; NO_SVE-NEXT: mov v0.d[1], v1.d[0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v2f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: fmov d1, #5.00000000 @@ -246,6 +382,14 @@ } define <4 x double> @insertelement_v4f64(<4 x double>* %a) #0 { +; NO_SVE-LABEL: insertelement_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov d0, #5.00000000 +; NO_SVE-NEXT: ldr q1, [x0, #16] +; NO_SVE-NEXT: mov v1.d[1], v0.d[0] +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_GE_256-LABEL: insertelement_v4f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov w9, #3 @@ -265,6 +409,15 @@ } define <8 x double> @insertelement_v8f64(<8 x double>* %a) #0 { +; NO_SVE-LABEL: insertelement_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov d2, #5.00000000 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: mov v3.d[1], v2.d[0] +; NO_SVE-NEXT: ldr q2, [x0, #32] +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: insertelement_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: mov w9, #7 @@ -284,6 +437,16 @@ } define <16 x double> @insertelement_v16f64(<16 x double>* %a) #0 { +; NO_SVE-LABEL: insertelement_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q6, q7, [x0, #96] +; NO_SVE-NEXT: fmov d5, #5.00000000 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: mov v7.d[1], v5.d[0] +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: ldp q4, q5, [x0, #64] +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: insertelement_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: mov w9, #15 @@ -303,6 +466,28 @@ } define <32 x double> @insertelement_v32f64(<32 x double>* %a) #0 { +; NO_SVE-LABEL: insertelement_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q23, q24, [x0, #224] +; NO_SVE-NEXT: fmov d0, #5.00000000 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: mov v24.d[1], v0.d[0] +; NO_SVE-NEXT: ldp q3, q4, [x0, #32] +; NO_SVE-NEXT: ldp q5, q6, [x0, #64] +; NO_SVE-NEXT: ldp q7, q16, [x0, #96] +; NO_SVE-NEXT: ldp q17, q18, [x0, #128] +; NO_SVE-NEXT: ldp q19, q20, [x0, #192] +; NO_SVE-NEXT: ldp q21, q22, [x0, #160] +; NO_SVE-NEXT: stp q1, q2, [x8] +; NO_SVE-NEXT: stp q3, q4, [x8, #32] +; NO_SVE-NEXT: stp q5, q6, [x8, #64] +; NO_SVE-NEXT: stp q7, q16, [x8, #96] +; NO_SVE-NEXT: stp q17, q18, [x8, #128] +; NO_SVE-NEXT: stp q21, q22, [x8, #160] +; NO_SVE-NEXT: stp q19, q20, [x8, #192] +; NO_SVE-NEXT: stp q23, q24, [x8, #224] +; NO_SVE-NEXT: ret +; ; VBITS_GE_2048-LABEL: insertelement_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: mov w9, #31 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK @@ -25,6 +26,38 @@ ; define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldrb w8, [x0] +; 
NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fmov s0, w8 +; NO_SVE-NEXT: ldrb w8, [x0, #1] +; NO_SVE-NEXT: mov v0.s[1], w8 +; NO_SVE-NEXT: cmeq v0.2s, v0.2s, #0 +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB0_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[0], [x9] +; NO_SVE-NEXT: .LBB0_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB0_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[4], [x8] +; NO_SVE-NEXT: .LBB0_4: // %else2 +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: strb w9, [x0] +; NO_SVE-NEXT: strb w8, [x0, #1] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ldrb w8, [x0] @@ -50,6 +83,56 @@ } define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: cmeq v0.4h, v0.4h, #0 +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbnz w10, #0, .LBB1_6 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB1_7 +; NO_SVE-NEXT: .LBB1_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB1_8 +; NO_SVE-NEXT: .LBB1_3: // %else5 +; NO_SVE-NEXT: tbz w8, #3, .LBB1_5 +; NO_SVE-NEXT: .LBB1_4: // %cond.load7 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[6], [x8] +; NO_SVE-NEXT: .LBB1_5: // %else8 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: str s0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB1_6: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[0], [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB1_2 +; NO_SVE-NEXT: .LBB1_7: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB1_3 +; NO_SVE-NEXT: .LBB1_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB1_4 +; NO_SVE-NEXT: b .LBB1_5 +; ; CHECK-LABEL: masked_gather_v4i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] @@ -76,6 +159,95 @@ define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { ; Ensure sensible type legalisation. 
+; NO_SVE-LABEL: masked_gather_v8i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.8b, v0.8b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB2_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB2_11 +; NO_SVE-NEXT: .LBB2_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB2_12 +; NO_SVE-NEXT: .LBB2_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB2_13 +; NO_SVE-NEXT: .LBB2_4: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB2_14 +; NO_SVE-NEXT: .LBB2_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB2_15 +; NO_SVE-NEXT: .LBB2_6: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB2_16 +; NO_SVE-NEXT: .LBB2_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB2_9 +; NO_SVE-NEXT: .LBB2_8: // %cond.load19 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[7], [x8] +; NO_SVE-NEXT: .LBB2_9: // %else20 +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB2_10: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr b0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB2_2 +; NO_SVE-NEXT: .LBB2_11: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB2_3 +; NO_SVE-NEXT: .LBB2_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB2_4 +; NO_SVE-NEXT: .LBB2_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB2_5 +; NO_SVE-NEXT: .LBB2_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB2_6 +; NO_SVE-NEXT: .LBB2_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB2_7 +; NO_SVE-NEXT: .LBB2_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB2_8 +; NO_SVE-NEXT: b .LBB2_9 +; ; VBITS_EQ_256-LABEL: masked_gather_v8i8: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: ldr d0, [x0] @@ -131,6 +303,230 @@ } define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: 
umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[8] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v0.b[9] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v0.b[10] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v0.b[11] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v0.b[12] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v0.b[13] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v0.b[14] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[15] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB3_18 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB3_19 +; NO_SVE-NEXT: .LBB3_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB3_20 +; NO_SVE-NEXT: .LBB3_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB3_21 +; NO_SVE-NEXT: .LBB3_4: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB3_22 +; NO_SVE-NEXT: .LBB3_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB3_23 +; NO_SVE-NEXT: .LBB3_6: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB3_24 +; NO_SVE-NEXT: .LBB3_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB3_25 +; NO_SVE-NEXT: .LBB3_8: // %else20 +; NO_SVE-NEXT: ldr q1, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB3_26 +; NO_SVE-NEXT: .LBB3_9: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB3_27 +; NO_SVE-NEXT: .LBB3_10: // %else26 +; NO_SVE-NEXT: ldr q1, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB3_28 +; NO_SVE-NEXT: .LBB3_11: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB3_29 +; NO_SVE-NEXT: .LBB3_12: // %else32 +; NO_SVE-NEXT: ldr q1, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB3_30 +; NO_SVE-NEXT: .LBB3_13: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB3_31 +; NO_SVE-NEXT: .LBB3_14: // %else38 +; NO_SVE-NEXT: ldr q1, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB3_32 +; NO_SVE-NEXT: .LBB3_15: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB3_17 +; NO_SVE-NEXT: .LBB3_16: // %cond.load43 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[15], [x8] +; NO_SVE-NEXT: .LBB3_17: // %else44 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB3_18: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr b0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB3_2 +; NO_SVE-NEXT: .LBB3_19: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: 
ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB3_3 +; NO_SVE-NEXT: .LBB3_20: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB3_4 +; NO_SVE-NEXT: .LBB3_21: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB3_5 +; NO_SVE-NEXT: .LBB3_22: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB3_6 +; NO_SVE-NEXT: .LBB3_23: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB3_7 +; NO_SVE-NEXT: .LBB3_24: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB3_8 +; NO_SVE-NEXT: .LBB3_25: // %cond.load19 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB3_9 +; NO_SVE-NEXT: .LBB3_26: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB3_10 +; NO_SVE-NEXT: .LBB3_27: // %cond.load25 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB3_11 +; NO_SVE-NEXT: .LBB3_28: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB3_12 +; NO_SVE-NEXT: .LBB3_29: // %cond.load31 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB3_13 +; NO_SVE-NEXT: .LBB3_30: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB3_14 +; NO_SVE-NEXT: .LBB3_31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB3_15 +; NO_SVE-NEXT: .LBB3_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB3_16 +; NO_SVE-NEXT: b .LBB3_17 +; +; VBITS_EQ_256-LABEL: masked_gather_v16i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ldr q0, [x0] +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: cmeq v0.16b, v0.16b, #0 +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: zip2 v1.8b, v0.8b, v0.8b +; VBITS_EQ_256-NEXT: zip1 v2.8b, v0.8b, v0.8b +; VBITS_EQ_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: shl v1.4h, v1.4h, #8 +; VBITS_EQ_256-NEXT: shl v2.4h, v2.4h, #8 +; VBITS_EQ_256-NEXT: zip2 v3.8b, v0.8b, v0.8b +; VBITS_EQ_256-NEXT: zip1 v0.8b, v0.8b, v0.8b +; VBITS_EQ_256-NEXT: sshr v1.4h, v1.4h, #8 +; VBITS_EQ_256-NEXT: sshr v2.4h, v2.4h, #8 +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: shl v3.4h, v3.4h, #8 +; VBITS_EQ_256-NEXT: shl v0.4h, v0.4h, #8 +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_EQ_256-NEXT: sshr v1.4h, v3.4h, #8 +; VBITS_EQ_256-NEXT: sshr v0.4h, v0.4h, #8 +; VBITS_EQ_256-NEXT: cmpne 
p2.d, p0/z, z2.d, #0 +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: ld1b { z2.d }, p1/z, [z6.d] +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_EQ_256-NEXT: ld1b { z3.d }, p2/z, [z7.d] +; VBITS_EQ_256-NEXT: ld1b { z0.d }, p1/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1b { z1.d }, p0/z, [z4.d] +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_EQ_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_256-NEXT: uzp1 v0.16b, v3.16b, v1.16b +; VBITS_EQ_256-NEXT: str q0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ldr q0, [x0] @@ -156,6 +552,509 @@ } define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: cmeq v1.16b, v1.16b, #0 +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[8] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[9] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[10] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[11] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[1] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: umov w9, v1.b[2] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[0] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w10, v1.b[3] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[4] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[12] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: bfi w15, w14, #1, #1 +; NO_SVE-NEXT: umov w14, v1.b[9] +; NO_SVE-NEXT: bfi w15, w9, #2, #1 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w15, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[7] +; NO_SVE-NEXT: bfi w15, w9, #4, #1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[13] +; NO_SVE-NEXT: bfi w15, w11, 
#5, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[8] +; NO_SVE-NEXT: umov w9, v0.b[14] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w11, w15, w11, lsl #6 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w10, w11, w10, lsl #7 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[10] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #13 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[11] +; NO_SVE-NEXT: orr w8, w8, w9, lsl #14 +; NO_SVE-NEXT: orr w9, w10, w11, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[12] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[13] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v0.b[15] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[15] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB4_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr b0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB4_3 +; NO_SVE-NEXT: b .LBB4_4 +; NO_SVE-NEXT: .LBB4_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB4_4 +; NO_SVE-NEXT: .LBB4_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB4_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB4_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB4_21 +; NO_SVE-NEXT: .LBB4_6: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB4_22 +; NO_SVE-NEXT: .LBB4_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB4_23 +; NO_SVE-NEXT: .LBB4_8: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB4_24 +; NO_SVE-NEXT: .LBB4_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB4_25 +; NO_SVE-NEXT: .LBB4_10: // %else20 +; NO_SVE-NEXT: ldr q1, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB4_26 +; NO_SVE-NEXT: .LBB4_11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB4_27 +; NO_SVE-NEXT: .LBB4_12: // %else26 +; NO_SVE-NEXT: ldr q1, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB4_28 +; NO_SVE-NEXT: .LBB4_13: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB4_29 +; NO_SVE-NEXT: .LBB4_14: // %else32 +; NO_SVE-NEXT: ldr q1, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB4_30 +; NO_SVE-NEXT: .LBB4_15: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB4_31 +; NO_SVE-NEXT: .LBB4_16: // %else38 +; NO_SVE-NEXT: ldr q1, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB4_32 +; NO_SVE-NEXT: .LBB4_17: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB4_33 +; NO_SVE-NEXT: .LBB4_18: // %else44 +; NO_SVE-NEXT: ldr q2, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB4_34 +; NO_SVE-NEXT: .LBB4_19: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB4_35 +; NO_SVE-NEXT: b .LBB4_36 +; NO_SVE-NEXT: .LBB4_20: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB4_6 +; NO_SVE-NEXT: .LBB4_21: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: ldr 
q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB4_7 +; NO_SVE-NEXT: .LBB4_22: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB4_8 +; NO_SVE-NEXT: .LBB4_23: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB4_9 +; NO_SVE-NEXT: .LBB4_24: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB4_10 +; NO_SVE-NEXT: .LBB4_25: // %cond.load19 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB4_11 +; NO_SVE-NEXT: .LBB4_26: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB4_12 +; NO_SVE-NEXT: .LBB4_27: // %cond.load25 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB4_13 +; NO_SVE-NEXT: .LBB4_28: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB4_14 +; NO_SVE-NEXT: .LBB4_29: // %cond.load31 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB4_15 +; NO_SVE-NEXT: .LBB4_30: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB4_16 +; NO_SVE-NEXT: .LBB4_31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB4_17 +; NO_SVE-NEXT: .LBB4_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB4_18 +; NO_SVE-NEXT: .LBB4_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #16, .LBB4_19 +; NO_SVE-NEXT: .LBB4_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #17, .LBB4_36 +; NO_SVE-NEXT: .LBB4_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[1], [x9] +; NO_SVE-NEXT: .LBB4_36: // %else50 +; NO_SVE-NEXT: ldr q2, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB4_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB4_53 +; NO_SVE-NEXT: .LBB4_38: // %else56 +; NO_SVE-NEXT: ldr q2, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #20, .LBB4_54 +; NO_SVE-NEXT: .LBB4_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB4_55 +; NO_SVE-NEXT: .LBB4_40: // %else62 +; NO_SVE-NEXT: ldr q2, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB4_56 +; NO_SVE-NEXT: .LBB4_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB4_57 +; NO_SVE-NEXT: .LBB4_42: // %else68 +; NO_SVE-NEXT: ldr q2, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #24, .LBB4_58 +; NO_SVE-NEXT: .LBB4_43: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB4_59 +; NO_SVE-NEXT: .LBB4_44: // %else74 +; NO_SVE-NEXT: ldr q2, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB4_60 +; NO_SVE-NEXT: .LBB4_45: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB4_61 +; NO_SVE-NEXT: .LBB4_46: // %else80 +; NO_SVE-NEXT: ldr q2, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #28, .LBB4_62 +; NO_SVE-NEXT: .LBB4_47: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB4_63 +; NO_SVE-NEXT: .LBB4_48: // %else86 +; NO_SVE-NEXT: ldr q2, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB4_64 +; NO_SVE-NEXT: .LBB4_49: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB4_51 
+; NO_SVE-NEXT: .LBB4_50: // %cond.load91 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[15], [x8] +; NO_SVE-NEXT: .LBB4_51: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB4_52: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB4_38 +; NO_SVE-NEXT: .LBB4_53: // %cond.load55 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB4_39 +; NO_SVE-NEXT: .LBB4_54: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB4_40 +; NO_SVE-NEXT: .LBB4_55: // %cond.load61 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[5], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB4_41 +; NO_SVE-NEXT: .LBB4_56: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB4_42 +; NO_SVE-NEXT: .LBB4_57: // %cond.load67 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[7], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB4_43 +; NO_SVE-NEXT: .LBB4_58: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB4_44 +; NO_SVE-NEXT: .LBB4_59: // %cond.load73 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[9], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #208] +; NO_SVE-NEXT: tbz w8, #26, .LBB4_45 +; NO_SVE-NEXT: .LBB4_60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB4_46 +; NO_SVE-NEXT: .LBB4_61: // %cond.load79 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[11], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB4_47 +; NO_SVE-NEXT: .LBB4_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB4_48 +; NO_SVE-NEXT: .LBB4_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.b }[13], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB4_49 +; NO_SVE-NEXT: .LBB4_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB4_50 +; NO_SVE-NEXT: b .LBB4_51 +; +; VBITS_EQ_256-LABEL: masked_gather_v32i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; VBITS_EQ_256-NEXT: .cfi_def_cfa_offset 16 +; VBITS_EQ_256-NEXT: mov x29, sp +; VBITS_EQ_256-NEXT: .cfi_def_cfa w29, 16 +; VBITS_EQ_256-NEXT: .cfi_offset w30, -8 +; VBITS_EQ_256-NEXT: .cfi_offset w29, -16 +; VBITS_EQ_256-NEXT: sub x9, sp, #48 +; VBITS_EQ_256-NEXT: and sp, x9, #0xffffffffffffffe0 +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: cmpeq p1.b, p0/z, z0.b, #0 +; VBITS_EQ_256-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: zip2 v2.8b, v4.8b, v0.8b +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: shl v3.4h, v2.4h, #8 +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: sshr v5.4h, v3.4h, #8 +; VBITS_EQ_256-NEXT: mov x8, #20 +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1b { z5.d }, p2/z, [z7.d] +; VBITS_EQ_256-NEXT: zip1 v7.8b, v4.8b, v0.8b +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: shl v7.4h, v7.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z5.h, z5.h, z5.h +; VBITS_EQ_256-NEXT: umov w8, v5.h[3] +; VBITS_EQ_256-NEXT: umov w9, v5.h[2] +; VBITS_EQ_256-NEXT: umov w10, v5.h[1] +; VBITS_EQ_256-NEXT: sshr v7.4h, v7.4h, #8 +; VBITS_EQ_256-NEXT: umov w11, v5.h[0] +; VBITS_EQ_256-NEXT: mov z5.d, z4.d +; VBITS_EQ_256-NEXT: sunpklo z7.s, z7.h +; VBITS_EQ_256-NEXT: ext z5.b, z5.b, z4.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z7.d, z7.s +; VBITS_EQ_256-NEXT: strb w8, [sp, #7] +; VBITS_EQ_256-NEXT: strb w9, [sp, #6] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z7.d, #0 +; VBITS_EQ_256-NEXT: strb w10, [sp, #5] +; VBITS_EQ_256-NEXT: strb w11, [sp, #4] +; VBITS_EQ_256-NEXT: ld1b { z7.d }, p2/z, [z17.d] +; VBITS_EQ_256-NEXT: zip2 v17.8b, v5.8b, v0.8b +; VBITS_EQ_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8 +; VBITS_EQ_256-NEXT: uzp1 z7.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: shl v17.4h, v17.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z7.h, z7.h, z7.h +; VBITS_EQ_256-NEXT: umov w8, v7.h[3] +; VBITS_EQ_256-NEXT: umov w9, v7.h[2] +; VBITS_EQ_256-NEXT: umov w10, v7.h[1] +; VBITS_EQ_256-NEXT: sshr v17.4h, v17.4h, #8 +; VBITS_EQ_256-NEXT: umov w11, v7.h[0] +; VBITS_EQ_256-NEXT: sunpklo z7.s, z17.h +; VBITS_EQ_256-NEXT: sunpklo z7.d, z7.s +; VBITS_EQ_256-NEXT: strb w8, [sp, #3] +; VBITS_EQ_256-NEXT: strb w9, [sp, #2] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z7.d, #0 +; VBITS_EQ_256-NEXT: strb w10, [sp, #1] +; VBITS_EQ_256-NEXT: strb w11, [sp] +; VBITS_EQ_256-NEXT: ld1b { z7.d }, p2/z, [z16.d] +; VBITS_EQ_256-NEXT: zip1 v16.8b, v5.8b, v0.8b +; VBITS_EQ_256-NEXT: uzp1 z7.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: shl v16.4h, v16.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z7.h, z7.h, z7.h +; VBITS_EQ_256-NEXT: umov w8, v7.h[3] +; VBITS_EQ_256-NEXT: umov w9, v7.h[2] +; VBITS_EQ_256-NEXT: umov w10, v7.h[1] +; VBITS_EQ_256-NEXT: sshr v16.4h, v16.4h, #8 +; VBITS_EQ_256-NEXT: umov w11, 
v7.h[0] +; VBITS_EQ_256-NEXT: sunpklo z7.s, z16.h +; VBITS_EQ_256-NEXT: sunpklo z7.d, z7.s +; VBITS_EQ_256-NEXT: strb w8, [sp, #23] +; VBITS_EQ_256-NEXT: strb w9, [sp, #22] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z7.d, #0 +; VBITS_EQ_256-NEXT: strb w10, [sp, #21] +; VBITS_EQ_256-NEXT: zip2 v7.8b, v4.8b, v0.8b +; VBITS_EQ_256-NEXT: strb w11, [sp, #20] +; VBITS_EQ_256-NEXT: zip1 v4.8b, v4.8b, v0.8b +; VBITS_EQ_256-NEXT: ld1b { z6.d }, p2/z, [z6.d] +; VBITS_EQ_256-NEXT: shl v7.4h, v7.4h, #8 +; VBITS_EQ_256-NEXT: shl v4.4h, v4.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z6.s, z6.s, z6.s +; VBITS_EQ_256-NEXT: sshr v7.4h, v7.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z6.h, z6.h, z6.h +; VBITS_EQ_256-NEXT: sshr v4.4h, v4.4h, #8 +; VBITS_EQ_256-NEXT: umov w8, v6.h[3] +; VBITS_EQ_256-NEXT: umov w9, v6.h[2] +; VBITS_EQ_256-NEXT: umov w10, v6.h[1] +; VBITS_EQ_256-NEXT: umov w11, v6.h[0] +; VBITS_EQ_256-NEXT: sunpklo z6.s, z7.h +; VBITS_EQ_256-NEXT: sunpklo z4.s, z4.h +; VBITS_EQ_256-NEXT: sunpklo z6.d, z6.s +; VBITS_EQ_256-NEXT: sunpklo z4.d, z4.s +; VBITS_EQ_256-NEXT: strb w8, [sp, #19] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: strb w9, [sp, #18] +; VBITS_EQ_256-NEXT: strb w10, [sp, #17] +; VBITS_EQ_256-NEXT: strb w11, [sp, #16] +; VBITS_EQ_256-NEXT: ld1b { z3.d }, p2/z, [z3.d] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: umov w8, v3.h[3] +; VBITS_EQ_256-NEXT: umov w9, v3.h[2] +; VBITS_EQ_256-NEXT: umov w10, v3.h[1] +; VBITS_EQ_256-NEXT: umov w11, v3.h[0] +; VBITS_EQ_256-NEXT: ext v3.16b, v5.16b, v5.16b, #8 +; VBITS_EQ_256-NEXT: strb w8, [sp, #15] +; VBITS_EQ_256-NEXT: strb w9, [sp, #14] +; VBITS_EQ_256-NEXT: strb w10, [sp, #13] +; VBITS_EQ_256-NEXT: zip2 v4.8b, v3.8b, v0.8b +; VBITS_EQ_256-NEXT: strb w11, [sp, #12] +; VBITS_EQ_256-NEXT: ld1b { z2.d }, p2/z, [z2.d] +; VBITS_EQ_256-NEXT: shl v4.4h, v4.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: sshr v4.4h, v4.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: umov w8, v2.h[3] +; VBITS_EQ_256-NEXT: umov w9, v2.h[2] +; VBITS_EQ_256-NEXT: umov w10, v2.h[1] +; VBITS_EQ_256-NEXT: umov w11, v2.h[0] +; VBITS_EQ_256-NEXT: sunpklo z2.s, z4.h +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: strb w8, [sp, #11] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: strb w9, [sp, #10] +; VBITS_EQ_256-NEXT: zip1 v2.8b, v3.8b, v0.8b +; VBITS_EQ_256-NEXT: strb w10, [sp, #9] +; VBITS_EQ_256-NEXT: strb w11, [sp, #8] +; VBITS_EQ_256-NEXT: ld1b { z1.d }, p2/z, [z1.d] +; VBITS_EQ_256-NEXT: shl v2.4h, v2.4h, #8 +; VBITS_EQ_256-NEXT: sshr v2.4h, v2.4h, #8 +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: umov w8, v1.h[3] +; VBITS_EQ_256-NEXT: umov w9, v1.h[2] +; VBITS_EQ_256-NEXT: umov w10, v1.h[1] +; VBITS_EQ_256-NEXT: umov w11, v1.h[0] +; VBITS_EQ_256-NEXT: sunpklo z1.s, z2.h +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: strb w8, [sp, #31] +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: strb w9, [sp, #30] +; VBITS_EQ_256-NEXT: strb w10, [sp, #29] +; VBITS_EQ_256-NEXT: strb w11, [sp, #28] +; VBITS_EQ_256-NEXT: ld1b { z0.d }, p1/z, [z0.d] +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: umov w8, v0.h[3] +; VBITS_EQ_256-NEXT: umov w9, v0.h[2] +; VBITS_EQ_256-NEXT: umov w10, v0.h[1] +; VBITS_EQ_256-NEXT: umov w11, 
v0.h[0] +; VBITS_EQ_256-NEXT: strb w8, [sp, #27] +; VBITS_EQ_256-NEXT: strb w9, [sp, #26] +; VBITS_EQ_256-NEXT: strb w10, [sp, #25] +; VBITS_EQ_256-NEXT: strb w11, [sp, #24] +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [sp] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_EQ_256-NEXT: mov sp, x29 +; VBITS_EQ_256-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl32 @@ -185,6 +1084,38 @@ ; define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldrh w8, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fmov s0, w8 +; NO_SVE-NEXT: ldrh w8, [x0, #2] +; NO_SVE-NEXT: mov v0.s[1], w8 +; NO_SVE-NEXT: cmeq v0.2s, v0.2s, #0 +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB5_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[0], [x9] +; NO_SVE-NEXT: .LBB5_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB5_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[2], [x8] +; NO_SVE-NEXT: .LBB5_4: // %else2 +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: strh w9, [x0] +; NO_SVE-NEXT: strh w8, [x0, #2] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldrh w8, [x0] @@ -210,6 +1141,54 @@ } define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.4h, v0.4h, #0 +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbnz w10, #0, .LBB6_6 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB6_7 +; NO_SVE-NEXT: .LBB6_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB6_8 +; NO_SVE-NEXT: .LBB6_3: // %else5 +; NO_SVE-NEXT: tbz w8, #3, .LBB6_5 +; NO_SVE-NEXT: .LBB6_4: // %cond.load7 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x8] +; NO_SVE-NEXT: .LBB6_5: // %else8 +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB6_6: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB6_2 +; NO_SVE-NEXT: .LBB6_7: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB6_3 +; NO_SVE-NEXT: .LBB6_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB6_4 +; NO_SVE-NEXT: b .LBB6_5 +; ; CHECK-LABEL: masked_gather_v4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -234,6 +1213,96 @@ define void @masked_gather_v8i16(<8 x 
i16>* %a, <8 x i16*>* %b) #0 { ; Ensure sensible type legalisation. +; NO_SVE-LABEL: masked_gather_v8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB7_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB7_11 +; NO_SVE-NEXT: .LBB7_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB7_12 +; NO_SVE-NEXT: .LBB7_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB7_13 +; NO_SVE-NEXT: .LBB7_4: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB7_14 +; NO_SVE-NEXT: .LBB7_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB7_15 +; NO_SVE-NEXT: .LBB7_6: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB7_16 +; NO_SVE-NEXT: .LBB7_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB7_9 +; NO_SVE-NEXT: .LBB7_8: // %cond.load19 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x8] +; NO_SVE-NEXT: .LBB7_9: // %else20 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB7_10: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB7_2 +; NO_SVE-NEXT: .LBB7_11: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB7_3 +; NO_SVE-NEXT: .LBB7_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB7_4 +; NO_SVE-NEXT: .LBB7_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB7_5 +; NO_SVE-NEXT: .LBB7_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB7_6 +; NO_SVE-NEXT: .LBB7_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB7_7 +; NO_SVE-NEXT: .LBB7_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB7_8 +; NO_SVE-NEXT: b .LBB7_9 +; ; VBITS_EQ_256-LABEL: masked_gather_v8i16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: ldr q0, [x0] @@ -282,6 +1351,225 @@ } define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; 
NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB8_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB8_3 +; NO_SVE-NEXT: b .LBB8_4 +; NO_SVE-NEXT: .LBB8_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB8_4 +; NO_SVE-NEXT: .LBB8_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB8_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB8_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB8_13 +; NO_SVE-NEXT: .LBB8_6: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB8_14 +; NO_SVE-NEXT: .LBB8_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB8_15 +; NO_SVE-NEXT: .LBB8_8: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB8_16 +; NO_SVE-NEXT: .LBB8_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB8_17 +; NO_SVE-NEXT: .LBB8_10: // %else20 +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB8_18 +; NO_SVE-NEXT: .LBB8_11: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB8_19 +; NO_SVE-NEXT: b .LBB8_20 +; NO_SVE-NEXT: .LBB8_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB8_6 +; NO_SVE-NEXT: .LBB8_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB8_7 +; NO_SVE-NEXT: .LBB8_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB8_8 +; NO_SVE-NEXT: 
.LBB8_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB8_9 +; NO_SVE-NEXT: .LBB8_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB8_10 +; NO_SVE-NEXT: .LBB8_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB8_11 +; NO_SVE-NEXT: .LBB8_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #9, .LBB8_20 +; NO_SVE-NEXT: .LBB8_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[1], [x9] +; NO_SVE-NEXT: .LBB8_20: // %else26 +; NO_SVE-NEXT: ldr q2, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB8_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB8_29 +; NO_SVE-NEXT: .LBB8_22: // %else32 +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB8_30 +; NO_SVE-NEXT: .LBB8_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB8_31 +; NO_SVE-NEXT: .LBB8_24: // %else38 +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB8_32 +; NO_SVE-NEXT: .LBB8_25: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB8_27 +; NO_SVE-NEXT: .LBB8_26: // %cond.load43 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[7], [x8] +; NO_SVE-NEXT: .LBB8_27: // %else44 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB8_28: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB8_22 +; NO_SVE-NEXT: .LBB8_29: // %cond.load31 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB8_23 +; NO_SVE-NEXT: .LBB8_30: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB8_24 +; NO_SVE-NEXT: .LBB8_31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[5], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB8_25 +; NO_SVE-NEXT: .LBB8_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB8_26 +; NO_SVE-NEXT: b .LBB8_27 +; +; VBITS_EQ_256-LABEL: masked_gather_v16i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: cmpeq p2.h, p0/z, z0.h, #0 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z4.s, z2.h +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z4.d, z4.s +; VBITS_EQ_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_EQ_256-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: ld1h { z6.d }, p2/z, [z6.d] +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: sunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: 
sunpklo z4.s, z4.h +; VBITS_EQ_256-NEXT: ld1h { z3.d }, p2/z, [z3.d] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: sunpklo z2.d, z4.s +; VBITS_EQ_256-NEXT: ld1h { z0.d }, p2/z, [z0.d] +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: uzp1 z2.s, z6.s, z6.s +; VBITS_EQ_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: mov v2.d[1], v3.d[0] +; VBITS_EQ_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_256-NEXT: splice z2.h, p1, z2.h, z0.h +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 @@ -305,6 +1593,431 @@ } define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: umov w14, v2.b[1] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: umov w15, v2.b[2] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #8 +; NO_SVE-NEXT: umov w16, v2.b[0] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #9 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #10 +; NO_SVE-NEXT: umov w10, v2.b[3] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w9, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: umov w11, v2.b[4] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[5] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w15, w14, #1, #1 +; NO_SVE-NEXT: bfi w15, w9, #2, #1 +; NO_SVE-NEXT: and w9, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #12 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[6] +; NO_SVE-NEXT: cmeq v0.8h, v3.8h, #0 +; NO_SVE-NEXT: and w11, w16, #0x1 +; NO_SVE-NEXT: bfi w15, w9, #3, #1 +; 
NO_SVE-NEXT: umov w9, v2.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w15, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w15, w11, #5, #1 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: umov w14, v0.b[1] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #13 +; NO_SVE-NEXT: orr w11, w15, w11, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w11, w9, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB9_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB9_3 +; NO_SVE-NEXT: b .LBB9_4 +; NO_SVE-NEXT: .LBB9_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB9_4 +; NO_SVE-NEXT: .LBB9_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB9_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB9_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB9_13 +; NO_SVE-NEXT: .LBB9_6: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB9_14 +; NO_SVE-NEXT: .LBB9_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB9_15 +; NO_SVE-NEXT: .LBB9_8: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB9_16 +; NO_SVE-NEXT: .LBB9_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB9_17 +; NO_SVE-NEXT: .LBB9_10: // %else20 +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB9_18 +; NO_SVE-NEXT: .LBB9_11: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB9_19 +; NO_SVE-NEXT: b .LBB9_20 +; NO_SVE-NEXT: .LBB9_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB9_6 +; NO_SVE-NEXT: .LBB9_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB9_7 +; NO_SVE-NEXT: .LBB9_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB9_8 +; NO_SVE-NEXT: .LBB9_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB9_9 +; NO_SVE-NEXT: .LBB9_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB9_10 +; NO_SVE-NEXT: .LBB9_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], 
[x9] +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB9_11 +; NO_SVE-NEXT: .LBB9_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #9, .LBB9_20 +; NO_SVE-NEXT: .LBB9_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[1], [x9] +; NO_SVE-NEXT: .LBB9_20: // %else26 +; NO_SVE-NEXT: ldr q2, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB9_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB9_29 +; NO_SVE-NEXT: .LBB9_22: // %else32 +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB9_30 +; NO_SVE-NEXT: .LBB9_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB9_31 +; NO_SVE-NEXT: .LBB9_24: // %else38 +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB9_32 +; NO_SVE-NEXT: .LBB9_25: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB9_33 +; NO_SVE-NEXT: .LBB9_26: // %else44 +; NO_SVE-NEXT: ldr q3, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB9_34 +; NO_SVE-NEXT: .LBB9_27: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB9_35 +; NO_SVE-NEXT: b .LBB9_36 +; NO_SVE-NEXT: .LBB9_28: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB9_22 +; NO_SVE-NEXT: .LBB9_29: // %cond.load31 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB9_23 +; NO_SVE-NEXT: .LBB9_30: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB9_24 +; NO_SVE-NEXT: .LBB9_31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[5], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB9_25 +; NO_SVE-NEXT: .LBB9_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB9_26 +; NO_SVE-NEXT: .LBB9_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[7], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #16, .LBB9_27 +; NO_SVE-NEXT: .LBB9_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB9_36 +; NO_SVE-NEXT: .LBB9_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB9_36: // %else50 +; NO_SVE-NEXT: ldr q3, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB9_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB9_45 +; NO_SVE-NEXT: .LBB9_38: // %else56 +; NO_SVE-NEXT: ldr q3, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #20, .LBB9_46 +; NO_SVE-NEXT: .LBB9_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB9_47 +; NO_SVE-NEXT: .LBB9_40: // %else62 +; NO_SVE-NEXT: ldr q3, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB9_48 +; NO_SVE-NEXT: .LBB9_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB9_49 +; NO_SVE-NEXT: .LBB9_42: // %else68 +; NO_SVE-NEXT: ldr q4, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB9_50 +; NO_SVE-NEXT: .LBB9_43: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB9_51 +; NO_SVE-NEXT: b .LBB9_52 +; NO_SVE-NEXT: .LBB9_44: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB9_38 +; NO_SVE-NEXT: .LBB9_45: // %cond.load55 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB9_39 +; NO_SVE-NEXT: .LBB9_46: // 
%cond.load58 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB9_40 +; NO_SVE-NEXT: .LBB9_47: // %cond.load61 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB9_41 +; NO_SVE-NEXT: .LBB9_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB9_42 +; NO_SVE-NEXT: .LBB9_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #24, .LBB9_43 +; NO_SVE-NEXT: .LBB9_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #25, .LBB9_52 +; NO_SVE-NEXT: .LBB9_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: .LBB9_52: // %else74 +; NO_SVE-NEXT: ldr q4, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB9_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB9_61 +; NO_SVE-NEXT: .LBB9_54: // %else80 +; NO_SVE-NEXT: ldr q4, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #28, .LBB9_62 +; NO_SVE-NEXT: .LBB9_55: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB9_63 +; NO_SVE-NEXT: .LBB9_56: // %else86 +; NO_SVE-NEXT: ldr q4, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB9_64 +; NO_SVE-NEXT: .LBB9_57: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB9_59 +; NO_SVE-NEXT: .LBB9_58: // %cond.load91 +; NO_SVE-NEXT: mov x8, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[7], [x8] +; NO_SVE-NEXT: .LBB9_59: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB9_60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB9_54 +; NO_SVE-NEXT: .LBB9_61: // %cond.load79 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB9_55 +; NO_SVE-NEXT: .LBB9_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB9_56 +; NO_SVE-NEXT: .LBB9_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB9_57 +; NO_SVE-NEXT: .LBB9_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB9_58 +; NO_SVE-NEXT: b .LBB9_59 +; +; VBITS_EQ_256-LABEL: masked_gather_v32i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: cmpeq p2.h, p0/z, z3.h, #0 +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z18.s, z3.h +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x9, lsl #3] +; 
VBITS_EQ_256-NEXT: ext v5.16b, v3.16b, v3.16b, #8 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z18.d, z18.s +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z18.d, #0 +; VBITS_EQ_256-NEXT: ext v18.16b, v3.16b, v3.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z3.s, z3.h +; VBITS_EQ_256-NEXT: sunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: sunpklo z18.s, z18.h +; VBITS_EQ_256-NEXT: ld1h { z17.d }, p2/z, [z17.d] +; VBITS_EQ_256-NEXT: cmpeq p2.h, p0/z, z4.h, #0 +; VBITS_EQ_256-NEXT: sunpklo z18.d, z18.s +; VBITS_EQ_256-NEXT: ld1h { z4.d }, p3/z, [z16.d] +; VBITS_EQ_256-NEXT: mov z16.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z18.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z3.d }, p2/z, [z7.d] +; VBITS_EQ_256-NEXT: ld1h { z6.d }, p3/z, [z6.d] +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 z7.h, z17.h, z17.h +; VBITS_EQ_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z6.s, z6.s, z6.s +; VBITS_EQ_256-NEXT: mov v7.d[1], v4.d[0] +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: ext v4.16b, v16.16b, v16.16b, #8 +; VBITS_EQ_256-NEXT: uzp1 z6.h, z6.h, z6.h +; VBITS_EQ_256-NEXT: mov v3.d[1], v6.d[0] +; VBITS_EQ_256-NEXT: sunpklo z6.s, z16.h +; VBITS_EQ_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z6.d, z6.s +; VBITS_EQ_256-NEXT: ext v17.16b, v16.16b, v16.16b, #8 +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: sunpklo z4.s, z4.h +; VBITS_EQ_256-NEXT: sunpklo z4.d, z4.s +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z4.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z4.d }, p2/z, [z5.d] +; VBITS_EQ_256-NEXT: sunpklo z5.s, z16.h +; VBITS_EQ_256-NEXT: sunpklo z6.s, z17.h +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: sunpklo z6.d, z6.s +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z2.d }, p3/z, [z2.d] +; VBITS_EQ_256-NEXT: ld1h { z1.d }, p2/z, [z1.d] +; VBITS_EQ_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; VBITS_EQ_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: splice z7.h, p1, z7.h, z3.h +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: mov v4.d[1], v2.d[0] +; VBITS_EQ_256-NEXT: st1h { z7.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_256-NEXT: splice z4.h, p1, z4.h, z1.h +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 @@ -332,6 +2045,32 @@ ; define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.2s, v0.2s, #0 +; 
NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB10_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: .LBB10_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB10_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x8] +; NO_SVE-NEXT: .LBB10_4: // %else2 +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -353,6 +2092,55 @@ } define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: xtn v0.4h, v0.4s +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbnz w10, #0, .LBB11_6 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB11_7 +; NO_SVE-NEXT: .LBB11_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB11_8 +; NO_SVE-NEXT: .LBB11_3: // %else5 +; NO_SVE-NEXT: tbz w8, #3, .LBB11_5 +; NO_SVE-NEXT: .LBB11_4: // %cond.load7 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x8] +; NO_SVE-NEXT: .LBB11_5: // %else8 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB11_6: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB11_2 +; NO_SVE-NEXT: .LBB11_7: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB11_3 +; NO_SVE-NEXT: .LBB11_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB11_4 +; NO_SVE-NEXT: b .LBB11_5 +; ; CHECK-LABEL: masked_gather_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -375,6 +2163,97 @@ define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 { ; Ensure sensible type legalisation. 
+; NO_SVE-LABEL: masked_gather_v8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB12_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB12_3 +; NO_SVE-NEXT: b .LBB12_4 +; NO_SVE-NEXT: .LBB12_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB12_4 +; NO_SVE-NEXT: .LBB12_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB12_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB12_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB12_9 +; NO_SVE-NEXT: .LBB12_6: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB12_10 +; NO_SVE-NEXT: .LBB12_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB12_11 +; NO_SVE-NEXT: b .LBB12_12 +; NO_SVE-NEXT: .LBB12_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB12_6 +; NO_SVE-NEXT: .LBB12_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB12_7 +; NO_SVE-NEXT: .LBB12_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB12_12 +; NO_SVE-NEXT: .LBB12_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB12_12: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB12_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB12_15 +; NO_SVE-NEXT: .LBB12_14: // %cond.load19 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x8] +; NO_SVE-NEXT: .LBB12_15: // %else20 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB12_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB12_14 +; NO_SVE-NEXT: b .LBB12_15 +; ; VBITS_EQ_256-LABEL: masked_gather_v8i32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 @@ -420,6 +2299,225 @@ } define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: cmeq 
v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q1, [x0, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB13_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB13_3 +; NO_SVE-NEXT: b .LBB13_4 +; NO_SVE-NEXT: .LBB13_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB13_4 +; NO_SVE-NEXT: .LBB13_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB13_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB13_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB13_9 +; NO_SVE-NEXT: .LBB13_6: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB13_10 +; NO_SVE-NEXT: .LBB13_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB13_11 +; NO_SVE-NEXT: b .LBB13_12 +; NO_SVE-NEXT: .LBB13_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB13_6 +; NO_SVE-NEXT: .LBB13_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB13_7 +; NO_SVE-NEXT: .LBB13_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB13_12 +; NO_SVE-NEXT: .LBB13_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB13_12: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB13_16 +; 
NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB13_17 +; NO_SVE-NEXT: .LBB13_14: // %else20 +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB13_18 +; NO_SVE-NEXT: .LBB13_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB13_19 +; NO_SVE-NEXT: b .LBB13_20 +; NO_SVE-NEXT: .LBB13_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB13_14 +; NO_SVE-NEXT: .LBB13_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB13_15 +; NO_SVE-NEXT: .LBB13_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB13_20 +; NO_SVE-NEXT: .LBB13_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB13_20: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB13_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB13_25 +; NO_SVE-NEXT: .LBB13_22: // %else32 +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB13_26 +; NO_SVE-NEXT: .LBB13_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB13_27 +; NO_SVE-NEXT: b .LBB13_28 +; NO_SVE-NEXT: .LBB13_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB13_22 +; NO_SVE-NEXT: .LBB13_25: // %cond.load31 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB13_23 +; NO_SVE-NEXT: .LBB13_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #13, .LBB13_28 +; NO_SVE-NEXT: .LBB13_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB13_28: // %else38 +; NO_SVE-NEXT: ldr q4, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB13_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB13_31 +; NO_SVE-NEXT: .LBB13_30: // %cond.load43 +; NO_SVE-NEXT: mov x8, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x8] +; NO_SVE-NEXT: .LBB13_31: // %else44 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB13_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB13_30 +; NO_SVE-NEXT: b .LBB13_31 +; +; VBITS_EQ_256-LABEL: masked_gather_v16i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 +; VBITS_EQ_256-NEXT: cmpeq p3.s, p0/z, z1.s, #0 +; VBITS_EQ_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: 
ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z4.d }, p2/z, [z4.d] +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z0.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p3/z, [z3.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p2/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p1/z, [z2.d] +; VBITS_EQ_256-NEXT: ptrue p1.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z3.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: splice z3.s, p1, z3.s, z0.s +; VBITS_EQ_256-NEXT: splice z1.s, p1, z1.s, z2.s +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 @@ -441,6 +2539,431 @@ } define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: cmeq v3.4s, v3.4s, #0 +; NO_SVE-NEXT: ldp q4, q5, [x0, #96] +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: cmeq v4.4s, v4.4s, #0 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: cmeq v5.4s, v5.4s, #0 +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: uzp1 v3.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v3.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: umov w10, v3.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v3.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v3.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: ldp q1, q4, [x0, #32] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[5] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[3] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[4] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: cmeq 
v2.4s, v4.4s, #0 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: bfi w9, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v0.b[7] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #13 +; NO_SVE-NEXT: xtn v0.8b, v1.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v3.b[6] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #6 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #7 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v3.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB14_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB14_3 +; NO_SVE-NEXT: b .LBB14_4 +; NO_SVE-NEXT: .LBB14_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB14_4 +; NO_SVE-NEXT: .LBB14_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB14_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB14_9 +; NO_SVE-NEXT: .LBB14_6: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB14_10 +; NO_SVE-NEXT: .LBB14_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB14_11 +; NO_SVE-NEXT: b .LBB14_12 +; NO_SVE-NEXT: .LBB14_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB14_6 +; NO_SVE-NEXT: .LBB14_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB14_7 +; NO_SVE-NEXT: .LBB14_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB14_12 +; NO_SVE-NEXT: .LBB14_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_12: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB14_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB14_17 +; NO_SVE-NEXT: .LBB14_14: // %else20 +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB14_18 
+; NO_SVE-NEXT: .LBB14_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB14_19 +; NO_SVE-NEXT: b .LBB14_20 +; NO_SVE-NEXT: .LBB14_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB14_14 +; NO_SVE-NEXT: .LBB14_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB14_15 +; NO_SVE-NEXT: .LBB14_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB14_20 +; NO_SVE-NEXT: .LBB14_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_20: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB14_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB14_25 +; NO_SVE-NEXT: .LBB14_22: // %else32 +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB14_26 +; NO_SVE-NEXT: .LBB14_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB14_27 +; NO_SVE-NEXT: b .LBB14_28 +; NO_SVE-NEXT: .LBB14_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB14_22 +; NO_SVE-NEXT: .LBB14_25: // %cond.load31 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB14_23 +; NO_SVE-NEXT: .LBB14_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #13, .LBB14_28 +; NO_SVE-NEXT: .LBB14_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_28: // %else38 +; NO_SVE-NEXT: ldr q4, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB14_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB14_33 +; NO_SVE-NEXT: .LBB14_30: // %else44 +; NO_SVE-NEXT: ldr q5, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB14_34 +; NO_SVE-NEXT: .LBB14_31: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB14_35 +; NO_SVE-NEXT: b .LBB14_36 +; NO_SVE-NEXT: .LBB14_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB14_30 +; NO_SVE-NEXT: .LBB14_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #16, .LBB14_31 +; NO_SVE-NEXT: .LBB14_34: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #17, .LBB14_36 +; NO_SVE-NEXT: .LBB14_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_36: // %else50 +; NO_SVE-NEXT: ldr q5, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB14_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB14_41 +; NO_SVE-NEXT: .LBB14_38: // %else56 +; NO_SVE-NEXT: ldr q6, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB14_42 +; NO_SVE-NEXT: .LBB14_39: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #21, .LBB14_43 +; NO_SVE-NEXT: b .LBB14_44 +; NO_SVE-NEXT: .LBB14_40: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB14_38 +; NO_SVE-NEXT: .LBB14_41: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: ldr q6, [x1, 
#160] +; NO_SVE-NEXT: tbnz w8, #20, .LBB14_39 +; NO_SVE-NEXT: .LBB14_42: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #21, .LBB14_44 +; NO_SVE-NEXT: .LBB14_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_44: // %else62 +; NO_SVE-NEXT: ldr q6, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB14_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB14_49 +; NO_SVE-NEXT: .LBB14_46: // %else68 +; NO_SVE-NEXT: ldr q7, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB14_50 +; NO_SVE-NEXT: .LBB14_47: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB14_51 +; NO_SVE-NEXT: b .LBB14_52 +; NO_SVE-NEXT: .LBB14_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB14_46 +; NO_SVE-NEXT: .LBB14_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #24, .LBB14_47 +; NO_SVE-NEXT: .LBB14_50: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w8, #25, .LBB14_52 +; NO_SVE-NEXT: .LBB14_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_52: // %else74 +; NO_SVE-NEXT: ldr q7, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB14_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB14_57 +; NO_SVE-NEXT: .LBB14_54: // %else80 +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB14_58 +; NO_SVE-NEXT: .LBB14_55: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #29, .LBB14_59 +; NO_SVE-NEXT: b .LBB14_60 +; NO_SVE-NEXT: .LBB14_56: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB14_54 +; NO_SVE-NEXT: .LBB14_57: // %cond.load79 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #28, .LBB14_55 +; NO_SVE-NEXT: .LBB14_58: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w8, #29, .LBB14_60 +; NO_SVE-NEXT: .LBB14_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: .LBB14_60: // %else86 +; NO_SVE-NEXT: ldr q16, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB14_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB14_63 +; NO_SVE-NEXT: .LBB14_62: // %cond.load91 +; NO_SVE-NEXT: mov x8, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x8] +; NO_SVE-NEXT: .LBB14_63: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB14_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB14_62 +; NO_SVE-NEXT: b .LBB14_63 +; +; VBITS_EQ_256-LABEL: masked_gather_v32i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] 
+; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: cmpeq p4.s, p0/z, z2.s, #0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [z18.d] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [z17.d] +; VBITS_EQ_256-NEXT: cmpeq p3.s, p0/z, z1.s, #0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: cmpeq p2.s, p0/z, z3.s, #0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [z16.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [z6.d] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [z7.d] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [z4.d] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -467,6 +2990,20 @@ ; Scalarize 1 x i64 gathers define void @masked_gather_v1i64(<1 x i64>* %a, <1 x i64*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v1i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr d0, [x0] +; 
NO_SVE-NEXT: fmov x8, d0 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: cbnz x8, .LBB15_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr d0, [x1] +; NO_SVE-NEXT: fmov x8, d0 +; NO_SVE-NEXT: ldr d0, [x8] +; NO_SVE-NEXT: .LBB15_2: // %else +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v1i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -489,6 +3026,33 @@ } define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: xtn v0.2s, v0.2d +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB16_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: .LBB16_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB16_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x8] +; NO_SVE-NEXT: .LBB16_4: // %else2 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -508,6 +3072,57 @@ } define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: xtn v0.4h, v0.4s +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbz w10, #0, .LBB17_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB17_3 +; NO_SVE-NEXT: b .LBB17_4 +; NO_SVE-NEXT: .LBB17_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB17_4 +; NO_SVE-NEXT: .LBB17_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB17_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB17_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB17_7 +; NO_SVE-NEXT: b .LBB17_8 +; NO_SVE-NEXT: .LBB17_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB17_8 +; NO_SVE-NEXT: .LBB17_7: // %cond.load7 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x8] +; NO_SVE-NEXT: .LBB17_8: // %else8 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -527,6 +3142,104 @@ define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 { ; Ensure sensible type legalisation. 
+; NO_SVE-LABEL: masked_gather_v8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB18_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB18_3 +; NO_SVE-NEXT: b .LBB18_4 +; NO_SVE-NEXT: .LBB18_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB18_4 +; NO_SVE-NEXT: .LBB18_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB18_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB18_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB18_7 +; NO_SVE-NEXT: b .LBB18_8 +; NO_SVE-NEXT: .LBB18_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB18_8 +; NO_SVE-NEXT: .LBB18_7: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB18_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB18_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB18_11 +; NO_SVE-NEXT: b .LBB18_12 +; NO_SVE-NEXT: .LBB18_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB18_12 +; NO_SVE-NEXT: .LBB18_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB18_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB18_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB18_15 +; NO_SVE-NEXT: b .LBB18_16 +; NO_SVE-NEXT: .LBB18_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB18_16 +; NO_SVE-NEXT: .LBB18_15: // %cond.load19 +; NO_SVE-NEXT: mov x8, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x8] +; NO_SVE-NEXT: .LBB18_16: // %else20 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: masked_gather_v8i64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -562,6 +3275,224 @@ } define void 
@masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: cmeq v4.2d, v4.2d, #0 +; NO_SVE-NEXT: cmeq v5.2d, v5.2d, #0 +; NO_SVE-NEXT: ldp q0, q1, [x0, #96] +; NO_SVE-NEXT: uzp1 v3.4s, v4.4s, v5.4s +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NO_SVE-NEXT: ldp q4, q5, [x0, #64] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: cmeq v4.2d, v4.2d, #0 +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: cmeq v3.2d, v5.2d, #0 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: uzp1 v2.4s, v4.4s, v3.4s +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v1.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v1.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[0] +; NO_SVE-NEXT: umov w12, v0.b[1] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v0.b[3] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v0.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v0.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB19_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB19_3 +; NO_SVE-NEXT: b .LBB19_4 +; NO_SVE-NEXT: .LBB19_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB19_4 +; NO_SVE-NEXT: .LBB19_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB19_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB19_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB19_7 +; NO_SVE-NEXT: b .LBB19_8 +; NO_SVE-NEXT: .LBB19_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB19_8 +; NO_SVE-NEXT: .LBB19_7: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], 
[x9] +; NO_SVE-NEXT: .LBB19_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB19_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB19_11 +; NO_SVE-NEXT: b .LBB19_12 +; NO_SVE-NEXT: .LBB19_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB19_12 +; NO_SVE-NEXT: .LBB19_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB19_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB19_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB19_15 +; NO_SVE-NEXT: b .LBB19_16 +; NO_SVE-NEXT: .LBB19_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB19_16 +; NO_SVE-NEXT: .LBB19_15: // %cond.load19 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x9] +; NO_SVE-NEXT: .LBB19_16: // %else20 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB19_18 +; NO_SVE-NEXT: // %bb.17: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB19_19 +; NO_SVE-NEXT: b .LBB19_20 +; NO_SVE-NEXT: .LBB19_18: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #9, .LBB19_20 +; NO_SVE-NEXT: .LBB19_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.d }[1], [x9] +; NO_SVE-NEXT: .LBB19_20: // %else26 +; NO_SVE-NEXT: ldr q6, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB19_22 +; NO_SVE-NEXT: // %bb.21: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #11, .LBB19_23 +; NO_SVE-NEXT: b .LBB19_24 +; NO_SVE-NEXT: .LBB19_22: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #11, .LBB19_24 +; NO_SVE-NEXT: .LBB19_23: // %cond.load31 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.d }[1], [x9] +; NO_SVE-NEXT: .LBB19_24: // %else32 +; NO_SVE-NEXT: ldr q7, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB19_26 +; NO_SVE-NEXT: // %bb.25: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB19_27 +; NO_SVE-NEXT: b .LBB19_28 +; NO_SVE-NEXT: .LBB19_26: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w8, #13, .LBB19_28 +; NO_SVE-NEXT: .LBB19_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.d }[1], [x9] +; NO_SVE-NEXT: .LBB19_28: // %else38 +; NO_SVE-NEXT: ldr q16, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB19_30 +; NO_SVE-NEXT: // %bb.29: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB19_31 +; NO_SVE-NEXT: b .LBB19_32 +; NO_SVE-NEXT: .LBB19_30: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w8, #15, .LBB19_32 +; NO_SVE-NEXT: .LBB19_31: // %cond.load43 +; NO_SVE-NEXT: mov x8, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.d }[1], [x8] +; NO_SVE-NEXT: .LBB19_32: // %else44 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: masked_gather_v16i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: 
ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_EQ_256-NEXT: cmpeq p3.d, p0/z, z1.d, #0 +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z2.d, #0 +; VBITS_EQ_256-NEXT: cmpeq p4.d, p0/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p3/z, [z4.d] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [z6.d] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p4/z, [z7.d] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -580,6 +3511,430 @@ } define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #160] +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: ldp q4, q5, [x0, #128] +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: cmeq v4.2d, v4.2d, #0 +; NO_SVE-NEXT: ldp q0, q1, [x0, #224] +; NO_SVE-NEXT: cmeq v5.2d, v5.2d, #0 +; NO_SVE-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q6, q7, [x0, #192] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v2.8h, v4.8h, v2.8h +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v6.2d, v6.2d, #0 +; NO_SVE-NEXT: cmeq v7.2d, v7.2d, #0 +; NO_SVE-NEXT: ldp q3, q5, [x0] +; NO_SVE-NEXT: uzp1 v1.4s, v6.4s, v7.4s +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: ldp q16, q17, [x0, #32] +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: cmeq v6.2d, v16.2d, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: cmeq v4.2d, v17.2d, #0 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: cmeq v2.2d, v5.2d, #0 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: uzp1 v4.4s, v6.4s, v4.4s +; NO_SVE-NEXT: umov w8, v0.b[0] +; NO_SVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w10, v0.b[1] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h +; NO_SVE-NEXT: orr w9, w9, w12, lsl #7 +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w9, w10, #0x1 +; NO_SVE-NEXT: umov 
w10, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: orr w8, w8, w9, lsl #9 +; NO_SVE-NEXT: umov w9, v0.b[5] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[1] +; NO_SVE-NEXT: ldp q4, q5, [x0, #96] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: umov w13, v1.b[3] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #11 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: cmeq v4.2d, v4.2d, #0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #12 +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: cmeq v5.2d, v5.2d, #0 +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: umov w14, v1.b[4] +; NO_SVE-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; NO_SVE-NEXT: umov w15, v1.b[5] +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w11, w10, #1, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v4.8h +; NO_SVE-NEXT: bfi w11, w12, #2, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: bfi w11, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[7] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: bfi w11, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v0.b[6] +; NO_SVE-NEXT: bfi w11, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[0] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w9, w11, w13, lsl #6 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB20_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB20_3 +; NO_SVE-NEXT: b .LBB20_4 +; NO_SVE-NEXT: .LBB20_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB20_4 +; NO_SVE-NEXT: .LBB20_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB20_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB20_7 +; NO_SVE-NEXT: b .LBB20_8 +; NO_SVE-NEXT: .LBB20_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB20_8 +; NO_SVE-NEXT: .LBB20_7: // %cond.load7 +; 
NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB20_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB20_11 +; NO_SVE-NEXT: b .LBB20_12 +; NO_SVE-NEXT: .LBB20_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB20_12 +; NO_SVE-NEXT: .LBB20_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB20_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB20_15 +; NO_SVE-NEXT: b .LBB20_16 +; NO_SVE-NEXT: .LBB20_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB20_16 +; NO_SVE-NEXT: .LBB20_15: // %cond.load19 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_16: // %else20 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB20_18 +; NO_SVE-NEXT: // %bb.17: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB20_19 +; NO_SVE-NEXT: b .LBB20_20 +; NO_SVE-NEXT: .LBB20_18: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #9, .LBB20_20 +; NO_SVE-NEXT: .LBB20_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_20: // %else26 +; NO_SVE-NEXT: ldr q6, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB20_22 +; NO_SVE-NEXT: // %bb.21: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #11, .LBB20_23 +; NO_SVE-NEXT: b .LBB20_24 +; NO_SVE-NEXT: .LBB20_22: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #11, .LBB20_24 +; NO_SVE-NEXT: .LBB20_23: // %cond.load31 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_24: // %else32 +; NO_SVE-NEXT: ldr q7, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB20_26 +; NO_SVE-NEXT: // %bb.25: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB20_27 +; NO_SVE-NEXT: b .LBB20_28 +; NO_SVE-NEXT: .LBB20_26: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w8, #13, .LBB20_28 +; NO_SVE-NEXT: .LBB20_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_28: // %else38 +; NO_SVE-NEXT: ldr q16, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB20_30 +; NO_SVE-NEXT: // %bb.29: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB20_31 +; NO_SVE-NEXT: b .LBB20_32 +; NO_SVE-NEXT: .LBB20_30: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w8, #15, .LBB20_32 +; NO_SVE-NEXT: .LBB20_31: // %cond.load43 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_32: // %else44 +; NO_SVE-NEXT: ldr q17, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB20_34 +; NO_SVE-NEXT: // %bb.33: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB20_35 +; NO_SVE-NEXT: b .LBB20_36 +; NO_SVE-NEXT: .LBB20_34: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: tbz w8, #17, .LBB20_36 +; NO_SVE-NEXT: .LBB20_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v17.d[1] 
+; NO_SVE-NEXT: ld1 { v16.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_36: // %else50 +; NO_SVE-NEXT: ldr q18, [x1, #144] +; NO_SVE-NEXT: tbz w8, #18, .LBB20_38 +; NO_SVE-NEXT: // %bb.37: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d18 +; NO_SVE-NEXT: ld1 { v17.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #19, .LBB20_39 +; NO_SVE-NEXT: b .LBB20_40 +; NO_SVE-NEXT: .LBB20_38: +; NO_SVE-NEXT: // implicit-def: $q17 +; NO_SVE-NEXT: tbz w8, #19, .LBB20_40 +; NO_SVE-NEXT: .LBB20_39: // %cond.load55 +; NO_SVE-NEXT: mov x9, v18.d[1] +; NO_SVE-NEXT: ld1 { v17.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_40: // %else56 +; NO_SVE-NEXT: ldr q19, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB20_42 +; NO_SVE-NEXT: // %bb.41: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d19 +; NO_SVE-NEXT: ld1 { v18.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #21, .LBB20_43 +; NO_SVE-NEXT: b .LBB20_44 +; NO_SVE-NEXT: .LBB20_42: +; NO_SVE-NEXT: // implicit-def: $q18 +; NO_SVE-NEXT: tbz w8, #21, .LBB20_44 +; NO_SVE-NEXT: .LBB20_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v19.d[1] +; NO_SVE-NEXT: ld1 { v18.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_44: // %else62 +; NO_SVE-NEXT: ldr q20, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB20_46 +; NO_SVE-NEXT: // %bb.45: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d20 +; NO_SVE-NEXT: ld1 { v19.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #23, .LBB20_47 +; NO_SVE-NEXT: b .LBB20_48 +; NO_SVE-NEXT: .LBB20_46: +; NO_SVE-NEXT: // implicit-def: $q19 +; NO_SVE-NEXT: tbz w8, #23, .LBB20_48 +; NO_SVE-NEXT: .LBB20_47: // %cond.load67 +; NO_SVE-NEXT: mov x9, v20.d[1] +; NO_SVE-NEXT: ld1 { v19.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_48: // %else68 +; NO_SVE-NEXT: ldr q21, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB20_50 +; NO_SVE-NEXT: // %bb.49: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d21 +; NO_SVE-NEXT: ld1 { v20.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB20_51 +; NO_SVE-NEXT: b .LBB20_52 +; NO_SVE-NEXT: .LBB20_50: +; NO_SVE-NEXT: // implicit-def: $q20 +; NO_SVE-NEXT: tbz w8, #25, .LBB20_52 +; NO_SVE-NEXT: .LBB20_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v21.d[1] +; NO_SVE-NEXT: ld1 { v20.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_52: // %else74 +; NO_SVE-NEXT: ldr q22, [x1, #208] +; NO_SVE-NEXT: tbz w8, #26, .LBB20_54 +; NO_SVE-NEXT: // %bb.53: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d22 +; NO_SVE-NEXT: ld1 { v21.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #27, .LBB20_55 +; NO_SVE-NEXT: b .LBB20_56 +; NO_SVE-NEXT: .LBB20_54: +; NO_SVE-NEXT: // implicit-def: $q21 +; NO_SVE-NEXT: tbz w8, #27, .LBB20_56 +; NO_SVE-NEXT: .LBB20_55: // %cond.load79 +; NO_SVE-NEXT: mov x9, v22.d[1] +; NO_SVE-NEXT: ld1 { v21.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_56: // %else80 +; NO_SVE-NEXT: ldr q23, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB20_58 +; NO_SVE-NEXT: // %bb.57: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d23 +; NO_SVE-NEXT: ld1 { v22.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #29, .LBB20_59 +; NO_SVE-NEXT: b .LBB20_60 +; NO_SVE-NEXT: .LBB20_58: +; NO_SVE-NEXT: // implicit-def: $q22 +; NO_SVE-NEXT: tbz w8, #29, .LBB20_60 +; NO_SVE-NEXT: .LBB20_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v23.d[1] +; NO_SVE-NEXT: ld1 { v22.d }[1], [x9] +; NO_SVE-NEXT: .LBB20_60: // %else86 +; NO_SVE-NEXT: ldr q24, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB20_62 +; NO_SVE-NEXT: // %bb.61: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d24 +; NO_SVE-NEXT: ld1 { v23.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB20_63 +; NO_SVE-NEXT: b .LBB20_64 +; NO_SVE-NEXT: .LBB20_62: +; NO_SVE-NEXT: // implicit-def: $q23 +; NO_SVE-NEXT: tbz w8, #31, .LBB20_64 +; NO_SVE-NEXT: .LBB20_63: // %cond.load91 +; 
NO_SVE-NEXT: mov x8, v24.d[1] +; NO_SVE-NEXT: ld1 { v23.d }[1], [x8] +; NO_SVE-NEXT: .LBB20_64: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: stp q16, q17, [x0, #128] +; NO_SVE-NEXT: stp q18, q19, [x0, #160] +; NO_SVE-NEXT: stp q20, q21, [x0, #192] +; NO_SVE-NEXT: stp q22, q23, [x0, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: masked_gather_v32i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: mov x11, #16 +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: mov x13, #24 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z19.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z21.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z0.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p0/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p0/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p0/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p2/z, [z19.d] +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 +; VBITS_EQ_256-NEXT: cmpeq p1.d, p0/z, z6.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [z21.d] +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p2/z, [z22.d] +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p2/z, [z20.d] +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z4.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p2/z, [z18.d] +; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z5.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p2/z, [z17.d] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [z16.d] +; VBITS_EQ_256-NEXT: cmpeq p1.d, p0/z, z7.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [z23.d] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -602,6 +3957,32 @@ ; define void @masked_gather_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fcmeq v0.4h, v0.4h, #0.0 +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov 
w9, v0.h[0] +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB21_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: .LBB21_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB21_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x8] +; NO_SVE-NEXT: .LBB21_4: // %else2 +; NO_SVE-NEXT: str s0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s1, [x0] @@ -638,6 +4019,54 @@ } define void @masked_gather_v4f16(<4 x half>* %a, <4 x half*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fcmeq v0.4h, v0.4h, #0.0 +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbnz w10, #0, .LBB22_6 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB22_7 +; NO_SVE-NEXT: .LBB22_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB22_8 +; NO_SVE-NEXT: .LBB22_3: // %else5 +; NO_SVE-NEXT: tbz w8, #3, .LBB22_5 +; NO_SVE-NEXT: .LBB22_4: // %cond.load7 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x8] +; NO_SVE-NEXT: .LBB22_5: // %else8 +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB22_6: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB22_2 +; NO_SVE-NEXT: .LBB22_7: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB22_3 +; NO_SVE-NEXT: .LBB22_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB22_4 +; NO_SVE-NEXT: b .LBB22_5 +; ; CHECK-LABEL: masked_gather_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -661,6 +4090,121 @@ } define void @masked_gather_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl 
#6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB23_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB23_11 +; NO_SVE-NEXT: .LBB23_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB23_12 +; NO_SVE-NEXT: .LBB23_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB23_13 +; NO_SVE-NEXT: .LBB23_4: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB23_14 +; NO_SVE-NEXT: .LBB23_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB23_15 +; NO_SVE-NEXT: .LBB23_6: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB23_16 +; NO_SVE-NEXT: .LBB23_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB23_9 +; NO_SVE-NEXT: .LBB23_8: // %cond.load19 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x8] +; NO_SVE-NEXT: .LBB23_9: // %else20 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB23_10: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB23_2 +; NO_SVE-NEXT: .LBB23_11: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB23_3 +; NO_SVE-NEXT: .LBB23_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB23_4 +; NO_SVE-NEXT: .LBB23_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB23_5 +; NO_SVE-NEXT: .LBB23_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB23_6 +; NO_SVE-NEXT: .LBB23_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB23_7 +; NO_SVE-NEXT: .LBB23_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB23_8 +; NO_SVE-NEXT: b .LBB23_9 +; +; VBITS_EQ_256-LABEL: masked_gather_v8f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ldr q0, [x0] +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z0.d }, p1/z, [z3.d] +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z1.d }, p0/z, [z2.d] +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_256-NEXT: str q0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_gather_v8f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ldr q0, [x0] @@ -684,6 +4228,225 @@ } define void @masked_gather_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: fcmeq v0.8h, 
v0.8h, #0.0 +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, #0.0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB24_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB24_3 +; NO_SVE-NEXT: b .LBB24_4 +; NO_SVE-NEXT: .LBB24_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB24_4 +; NO_SVE-NEXT: .LBB24_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB24_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB24_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB24_13 +; NO_SVE-NEXT: .LBB24_6: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB24_14 +; NO_SVE-NEXT: .LBB24_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB24_15 +; NO_SVE-NEXT: .LBB24_8: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB24_16 +; NO_SVE-NEXT: .LBB24_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB24_17 +; NO_SVE-NEXT: .LBB24_10: // %else20 +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB24_18 +; NO_SVE-NEXT: .LBB24_11: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB24_19 +; NO_SVE-NEXT: b .LBB24_20 +; NO_SVE-NEXT: .LBB24_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB24_6 +; NO_SVE-NEXT: .LBB24_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB24_7 +; NO_SVE-NEXT: .LBB24_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB24_8 +; 
NO_SVE-NEXT: .LBB24_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB24_9 +; NO_SVE-NEXT: .LBB24_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB24_10 +; NO_SVE-NEXT: .LBB24_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB24_11 +; NO_SVE-NEXT: .LBB24_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #9, .LBB24_20 +; NO_SVE-NEXT: .LBB24_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[1], [x9] +; NO_SVE-NEXT: .LBB24_20: // %else26 +; NO_SVE-NEXT: ldr q2, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB24_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB24_29 +; NO_SVE-NEXT: .LBB24_22: // %else32 +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB24_30 +; NO_SVE-NEXT: .LBB24_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB24_31 +; NO_SVE-NEXT: .LBB24_24: // %else38 +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB24_32 +; NO_SVE-NEXT: .LBB24_25: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB24_27 +; NO_SVE-NEXT: .LBB24_26: // %cond.load43 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[7], [x8] +; NO_SVE-NEXT: .LBB24_27: // %else44 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB24_28: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB24_22 +; NO_SVE-NEXT: .LBB24_29: // %cond.load31 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB24_23 +; NO_SVE-NEXT: .LBB24_30: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB24_24 +; NO_SVE-NEXT: .LBB24_31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[5], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB24_25 +; NO_SVE-NEXT: .LBB24_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB24_26 +; NO_SVE-NEXT: b .LBB24_27 +; +; VBITS_EQ_256-LABEL: masked_gather_v16f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z4.s, z2.h +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z4.d, z4.s +; VBITS_EQ_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_EQ_256-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: ld1h { z6.d }, p2/z, [z6.d] +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: sunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: cmpne 
p2.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: sunpklo z4.s, z4.h +; VBITS_EQ_256-NEXT: ld1h { z3.d }, p2/z, [z3.d] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: sunpklo z2.d, z4.s +; VBITS_EQ_256-NEXT: ld1h { z0.d }, p2/z, [z0.d] +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: uzp1 z2.s, z6.s, z6.s +; VBITS_EQ_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: mov v2.d[1], v3.d[0] +; VBITS_EQ_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_256-NEXT: splice z2.h, p1, z2.h, z0.h +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 @@ -707,6 +4470,431 @@ } define void @masked_gather_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, #0.0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: fcmeq v2.8h, v2.8h, #0.0 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: umov w12, v2.b[1] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[0] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: umov w15, v1.b[5] +; NO_SVE-NEXT: orr w9, w9, w16, lsl #8 +; NO_SVE-NEXT: umov w16, v2.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #9 +; NO_SVE-NEXT: umov w9, v2.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #10 +; NO_SVE-NEXT: umov w10, v2.b[4] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: umov w11, v2.b[5] +; NO_SVE-NEXT: bfi w14, w12, #1, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: fcmeq v0.8h, v3.8h, #0.0 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #12 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[6] +; NO_SVE-NEXT: bfi w14, w15, #2, #1 +; NO_SVE-NEXT: and w11, 
w11, #0x1 +; NO_SVE-NEXT: bfi w14, w9, #3, #1 +; NO_SVE-NEXT: umov w9, v2.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w14, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w14, w11, #5, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[0] +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w11, w14, w11, lsl #6 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w11, w9, lsl #7 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #13 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB25_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB25_3 +; NO_SVE-NEXT: b .LBB25_4 +; NO_SVE-NEXT: .LBB25_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB25_4 +; NO_SVE-NEXT: .LBB25_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB25_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB25_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB25_13 +; NO_SVE-NEXT: .LBB25_6: // %else8 +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB25_14 +; NO_SVE-NEXT: .LBB25_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB25_15 +; NO_SVE-NEXT: .LBB25_8: // %else14 +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB25_16 +; NO_SVE-NEXT: .LBB25_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB25_17 +; NO_SVE-NEXT: .LBB25_10: // %else20 +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB25_18 +; NO_SVE-NEXT: .LBB25_11: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB25_19 +; NO_SVE-NEXT: b .LBB25_20 +; NO_SVE-NEXT: .LBB25_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB25_6 +; NO_SVE-NEXT: .LBB25_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB25_7 +; NO_SVE-NEXT: .LBB25_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB25_8 +; NO_SVE-NEXT: .LBB25_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB25_9 +; NO_SVE-NEXT: .LBB25_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB25_10 +; NO_SVE-NEXT: 
.LBB25_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB25_11 +; NO_SVE-NEXT: .LBB25_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #9, .LBB25_20 +; NO_SVE-NEXT: .LBB25_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[1], [x9] +; NO_SVE-NEXT: .LBB25_20: // %else26 +; NO_SVE-NEXT: ldr q2, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB25_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB25_29 +; NO_SVE-NEXT: .LBB25_22: // %else32 +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB25_30 +; NO_SVE-NEXT: .LBB25_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB25_31 +; NO_SVE-NEXT: .LBB25_24: // %else38 +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB25_32 +; NO_SVE-NEXT: .LBB25_25: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB25_33 +; NO_SVE-NEXT: .LBB25_26: // %else44 +; NO_SVE-NEXT: ldr q3, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB25_34 +; NO_SVE-NEXT: .LBB25_27: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB25_35 +; NO_SVE-NEXT: b .LBB25_36 +; NO_SVE-NEXT: .LBB25_28: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB25_22 +; NO_SVE-NEXT: .LBB25_29: // %cond.load31 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB25_23 +; NO_SVE-NEXT: .LBB25_30: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB25_24 +; NO_SVE-NEXT: .LBB25_31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[5], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB25_25 +; NO_SVE-NEXT: .LBB25_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB25_26 +; NO_SVE-NEXT: .LBB25_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.h }[7], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #16, .LBB25_27 +; NO_SVE-NEXT: .LBB25_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB25_36 +; NO_SVE-NEXT: .LBB25_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB25_36: // %else50 +; NO_SVE-NEXT: ldr q3, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB25_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB25_45 +; NO_SVE-NEXT: .LBB25_38: // %else56 +; NO_SVE-NEXT: ldr q3, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #20, .LBB25_46 +; NO_SVE-NEXT: .LBB25_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB25_47 +; NO_SVE-NEXT: .LBB25_40: // %else62 +; NO_SVE-NEXT: ldr q3, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB25_48 +; NO_SVE-NEXT: .LBB25_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB25_49 +; NO_SVE-NEXT: .LBB25_42: // %else68 +; NO_SVE-NEXT: ldr q4, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB25_50 +; NO_SVE-NEXT: .LBB25_43: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB25_51 +; NO_SVE-NEXT: b .LBB25_52 +; NO_SVE-NEXT: .LBB25_44: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB25_38 +; NO_SVE-NEXT: .LBB25_45: // %cond.load55 +; NO_SVE-NEXT: mov x9, v3.d[1] +; 
NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB25_39 +; NO_SVE-NEXT: .LBB25_46: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB25_40 +; NO_SVE-NEXT: .LBB25_47: // %cond.load61 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB25_41 +; NO_SVE-NEXT: .LBB25_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB25_42 +; NO_SVE-NEXT: .LBB25_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #24, .LBB25_43 +; NO_SVE-NEXT: .LBB25_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #25, .LBB25_52 +; NO_SVE-NEXT: .LBB25_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: .LBB25_52: // %else74 +; NO_SVE-NEXT: ldr q4, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB25_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB25_61 +; NO_SVE-NEXT: .LBB25_54: // %else80 +; NO_SVE-NEXT: ldr q4, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #28, .LBB25_62 +; NO_SVE-NEXT: .LBB25_55: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB25_63 +; NO_SVE-NEXT: .LBB25_56: // %else86 +; NO_SVE-NEXT: ldr q4, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB25_64 +; NO_SVE-NEXT: .LBB25_57: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB25_59 +; NO_SVE-NEXT: .LBB25_58: // %cond.load91 +; NO_SVE-NEXT: mov x8, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[7], [x8] +; NO_SVE-NEXT: .LBB25_59: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB25_60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB25_54 +; NO_SVE-NEXT: .LBB25_61: // %cond.load79 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB25_55 +; NO_SVE-NEXT: .LBB25_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB25_56 +; NO_SVE-NEXT: .LBB25_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB25_57 +; NO_SVE-NEXT: .LBB25_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB25_58 +; NO_SVE-NEXT: b .LBB25_59 +; +; VBITS_EQ_256-LABEL: masked_gather_v32f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z3.h, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; 
VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: sunpklo z18.s, z3.h +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ext v5.16b, v3.16b, v3.16b, #8 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z18.d, z18.s +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z18.d, #0 +; VBITS_EQ_256-NEXT: ext v18.16b, v3.16b, v3.16b, #8 +; VBITS_EQ_256-NEXT: sunpklo z3.s, z3.h +; VBITS_EQ_256-NEXT: sunpklo z5.s, z5.h +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: sunpklo z18.s, z18.h +; VBITS_EQ_256-NEXT: ld1h { z17.d }, p2/z, [z17.d] +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z4.h, #0.0 +; VBITS_EQ_256-NEXT: sunpklo z18.d, z18.s +; VBITS_EQ_256-NEXT: ld1h { z4.d }, p3/z, [z16.d] +; VBITS_EQ_256-NEXT: mov z16.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z18.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z3.d }, p2/z, [z7.d] +; VBITS_EQ_256-NEXT: ld1h { z6.d }, p3/z, [z6.d] +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 z7.h, z17.h, z17.h +; VBITS_EQ_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z6.s, z6.s, z6.s +; VBITS_EQ_256-NEXT: mov v7.d[1], v4.d[0] +; VBITS_EQ_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_EQ_256-NEXT: ext v4.16b, v16.16b, v16.16b, #8 +; VBITS_EQ_256-NEXT: uzp1 z6.h, z6.h, z6.h +; VBITS_EQ_256-NEXT: mov v3.d[1], v6.d[0] +; VBITS_EQ_256-NEXT: sunpklo z6.s, z16.h +; VBITS_EQ_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z6.d, z6.s +; VBITS_EQ_256-NEXT: ext v17.16b, v16.16b, v16.16b, #8 +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: sunpklo z4.s, z4.h +; VBITS_EQ_256-NEXT: sunpklo z4.d, z4.s +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z4.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z4.d }, p2/z, [z5.d] +; VBITS_EQ_256-NEXT: sunpklo z5.s, z16.h +; VBITS_EQ_256-NEXT: sunpklo z6.s, z17.h +; VBITS_EQ_256-NEXT: sunpklo z5.d, z5.s +; VBITS_EQ_256-NEXT: sunpklo z6.d, z6.s +; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: ld1h { z2.d }, p3/z, [z2.d] +; VBITS_EQ_256-NEXT: ld1h { z1.d }, p2/z, [z1.d] +; VBITS_EQ_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; VBITS_EQ_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: splice z7.h, p1, z7.h, z3.h +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: mov v4.d[1], v2.d[0] +; VBITS_EQ_256-NEXT: st1h { z7.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_256-NEXT: splice z4.h, p1, z4.h, z1.h +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 @@ -734,6 +4922,32 @@ ; define void @masked_gather_v2f32(<2 x float>* %a, <2 x float*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2f32: +; NO_SVE: // %bb.0: +; 
NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fcmeq v0.2s, v0.2s, #0.0 +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB26_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: .LBB26_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB26_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x8] +; NO_SVE-NEXT: .LBB26_4: // %else2 +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -755,6 +4969,55 @@ } define void @masked_gather_v4f32(<4 x float>* %a, <4 x float*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: xtn v0.4h, v0.4s +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbnz w10, #0, .LBB27_6 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB27_7 +; NO_SVE-NEXT: .LBB27_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB27_8 +; NO_SVE-NEXT: .LBB27_3: // %else5 +; NO_SVE-NEXT: tbz w8, #3, .LBB27_5 +; NO_SVE-NEXT: .LBB27_4: // %cond.load7 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x8] +; NO_SVE-NEXT: .LBB27_5: // %else8 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB27_6: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB27_2 +; NO_SVE-NEXT: .LBB27_7: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB27_3 +; NO_SVE-NEXT: .LBB27_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB27_4 +; NO_SVE-NEXT: b .LBB27_5 +; ; CHECK-LABEL: masked_gather_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -776,6 +5039,121 @@ } define void @masked_gather_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: 
and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB28_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB28_3 +; NO_SVE-NEXT: b .LBB28_4 +; NO_SVE-NEXT: .LBB28_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB28_4 +; NO_SVE-NEXT: .LBB28_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB28_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB28_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB28_9 +; NO_SVE-NEXT: .LBB28_6: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB28_10 +; NO_SVE-NEXT: .LBB28_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB28_11 +; NO_SVE-NEXT: b .LBB28_12 +; NO_SVE-NEXT: .LBB28_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB28_6 +; NO_SVE-NEXT: .LBB28_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB28_7 +; NO_SVE-NEXT: .LBB28_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB28_12 +; NO_SVE-NEXT: .LBB28_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB28_12: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB28_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB28_15 +; NO_SVE-NEXT: .LBB28_14: // %cond.load19 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x8] +; NO_SVE-NEXT: .LBB28_15: // %else20 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB28_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB28_14 +; NO_SVE-NEXT: b .LBB28_15 +; +; VBITS_EQ_256-LABEL: masked_gather_v8f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [z2.d] +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p1/z, [z1.d] +; VBITS_EQ_256-NEXT: ptrue p1.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z1.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_gather_v8f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 @@ -797,6 +5175,225 
@@ } define void @masked_gather_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q1, q2, [x0, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB29_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB29_3 +; NO_SVE-NEXT: b .LBB29_4 +; NO_SVE-NEXT: .LBB29_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB29_4 +; NO_SVE-NEXT: .LBB29_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB29_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB29_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB29_9 +; NO_SVE-NEXT: .LBB29_6: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB29_10 +; NO_SVE-NEXT: .LBB29_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB29_11 +; NO_SVE-NEXT: b .LBB29_12 +; NO_SVE-NEXT: .LBB29_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB29_6 +; NO_SVE-NEXT: .LBB29_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB29_7 +; NO_SVE-NEXT: .LBB29_10: +; NO_SVE-NEXT: // 
implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB29_12 +; NO_SVE-NEXT: .LBB29_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB29_12: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB29_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB29_17 +; NO_SVE-NEXT: .LBB29_14: // %else20 +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB29_18 +; NO_SVE-NEXT: .LBB29_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB29_19 +; NO_SVE-NEXT: b .LBB29_20 +; NO_SVE-NEXT: .LBB29_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB29_14 +; NO_SVE-NEXT: .LBB29_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB29_15 +; NO_SVE-NEXT: .LBB29_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB29_20 +; NO_SVE-NEXT: .LBB29_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB29_20: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB29_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB29_25 +; NO_SVE-NEXT: .LBB29_22: // %else32 +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB29_26 +; NO_SVE-NEXT: .LBB29_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB29_27 +; NO_SVE-NEXT: b .LBB29_28 +; NO_SVE-NEXT: .LBB29_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB29_22 +; NO_SVE-NEXT: .LBB29_25: // %cond.load31 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB29_23 +; NO_SVE-NEXT: .LBB29_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #13, .LBB29_28 +; NO_SVE-NEXT: .LBB29_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB29_28: // %else38 +; NO_SVE-NEXT: ldr q4, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB29_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB29_31 +; NO_SVE-NEXT: .LBB29_30: // %cond.load43 +; NO_SVE-NEXT: mov x8, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x8] +; NO_SVE-NEXT: .LBB29_31: // %else44 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB29_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB29_30 +; NO_SVE-NEXT: b .LBB29_31 +; +; VBITS_EQ_256-LABEL: masked_gather_v16f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; 
VBITS_EQ_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z4.d }, p2/z, [z4.d] +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z0.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p3/z, [z3.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p2/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p1/z, [z2.d] +; VBITS_EQ_256-NEXT: ptrue p1.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z3.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: splice z3.s, p1, z3.s, z0.s +; VBITS_EQ_256-NEXT: splice z1.s, p1, z1.s, z2.s +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 @@ -818,6 +5415,432 @@ } define void @masked_gather_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: ldp q4, q5, [x0, #96] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: uzp1 v4.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v4.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v4.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v4.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v4.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w13, v4.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v2.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldr q3, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v0.b[0] +; 
NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: fcmeq v1.4s, v3.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: bfi w9, w15, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: xtn v0.8b, v1.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v4.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v4.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB30_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB30_3 +; NO_SVE-NEXT: b .LBB30_4 +; NO_SVE-NEXT: .LBB30_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB30_4 +; NO_SVE-NEXT: .LBB30_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_4: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB30_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB30_9 +; NO_SVE-NEXT: .LBB30_6: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB30_10 +; NO_SVE-NEXT: .LBB30_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB30_11 +; NO_SVE-NEXT: b .LBB30_12 +; NO_SVE-NEXT: .LBB30_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB30_6 +; NO_SVE-NEXT: .LBB30_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB30_7 +; NO_SVE-NEXT: .LBB30_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB30_12 +; NO_SVE-NEXT: .LBB30_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, 
v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_12: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB30_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB30_17 +; NO_SVE-NEXT: .LBB30_14: // %else20 +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB30_18 +; NO_SVE-NEXT: .LBB30_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB30_19 +; NO_SVE-NEXT: b .LBB30_20 +; NO_SVE-NEXT: .LBB30_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB30_14 +; NO_SVE-NEXT: .LBB30_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB30_15 +; NO_SVE-NEXT: .LBB30_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB30_20 +; NO_SVE-NEXT: .LBB30_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_20: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB30_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB30_25 +; NO_SVE-NEXT: .LBB30_22: // %else32 +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB30_26 +; NO_SVE-NEXT: .LBB30_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB30_27 +; NO_SVE-NEXT: b .LBB30_28 +; NO_SVE-NEXT: .LBB30_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB30_22 +; NO_SVE-NEXT: .LBB30_25: // %cond.load31 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB30_23 +; NO_SVE-NEXT: .LBB30_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #13, .LBB30_28 +; NO_SVE-NEXT: .LBB30_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_28: // %else38 +; NO_SVE-NEXT: ldr q4, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB30_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB30_33 +; NO_SVE-NEXT: .LBB30_30: // %else44 +; NO_SVE-NEXT: ldr q5, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB30_34 +; NO_SVE-NEXT: .LBB30_31: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB30_35 +; NO_SVE-NEXT: b .LBB30_36 +; NO_SVE-NEXT: .LBB30_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB30_30 +; NO_SVE-NEXT: .LBB30_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #16, .LBB30_31 +; NO_SVE-NEXT: .LBB30_34: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #17, .LBB30_36 +; NO_SVE-NEXT: .LBB30_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_36: // %else50 +; NO_SVE-NEXT: ldr q5, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB30_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB30_41 +; NO_SVE-NEXT: .LBB30_38: // %else56 +; NO_SVE-NEXT: ldr q6, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB30_42 +; NO_SVE-NEXT: .LBB30_39: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; 
NO_SVE-NEXT: tbnz w8, #21, .LBB30_43 +; NO_SVE-NEXT: b .LBB30_44 +; NO_SVE-NEXT: .LBB30_40: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB30_38 +; NO_SVE-NEXT: .LBB30_41: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #20, .LBB30_39 +; NO_SVE-NEXT: .LBB30_42: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #21, .LBB30_44 +; NO_SVE-NEXT: .LBB30_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_44: // %else62 +; NO_SVE-NEXT: ldr q6, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB30_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB30_49 +; NO_SVE-NEXT: .LBB30_46: // %else68 +; NO_SVE-NEXT: ldr q7, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB30_50 +; NO_SVE-NEXT: .LBB30_47: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB30_51 +; NO_SVE-NEXT: b .LBB30_52 +; NO_SVE-NEXT: .LBB30_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB30_46 +; NO_SVE-NEXT: .LBB30_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #24, .LBB30_47 +; NO_SVE-NEXT: .LBB30_50: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w8, #25, .LBB30_52 +; NO_SVE-NEXT: .LBB30_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_52: // %else74 +; NO_SVE-NEXT: ldr q7, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB30_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB30_57 +; NO_SVE-NEXT: .LBB30_54: // %else80 +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB30_58 +; NO_SVE-NEXT: .LBB30_55: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #29, .LBB30_59 +; NO_SVE-NEXT: b .LBB30_60 +; NO_SVE-NEXT: .LBB30_56: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB30_54 +; NO_SVE-NEXT: .LBB30_57: // %cond.load79 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #28, .LBB30_55 +; NO_SVE-NEXT: .LBB30_58: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w8, #29, .LBB30_60 +; NO_SVE-NEXT: .LBB30_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_60: // %else86 +; NO_SVE-NEXT: ldr q16, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB30_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB30_63 +; NO_SVE-NEXT: .LBB30_62: // %cond.load91 +; NO_SVE-NEXT: mov x8, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x8] +; NO_SVE-NEXT: .LBB30_63: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB30_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB30_62 +; NO_SVE-NEXT: b .LBB30_63 +; +; VBITS_EQ_256-LABEL: masked_gather_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; 
VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [z18.d] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [z17.d] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [z16.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [z6.d] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [z7.d] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [z4.d] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] 
+; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -844,6 +5867,20 @@ ; Scalarize 1 x double gathers define void @masked_gather_v1f64(<1 x double>* %a, <1 x double*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: fcmp d0, #0.0 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: b.ne .LBB31_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr d0, [x1] +; NO_SVE-NEXT: fmov x8, d0 +; NO_SVE-NEXT: ldr d0, [x8] +; NO_SVE-NEXT: .LBB31_2: // %else +; NO_SVE-NEXT: str d0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -866,6 +5903,33 @@ } define void @masked_gather_v2f64(<2 x double>* %a, <2 x double*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; NO_SVE-NEXT: xtn v0.2s, v0.2d +; NO_SVE-NEXT: mov w8, v0.s[1] +; NO_SVE-NEXT: fmov w9, s0 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB32_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: .LBB32_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB32_4 +; NO_SVE-NEXT: // %bb.3: // %cond.load1 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x8] +; NO_SVE-NEXT: .LBB32_4: // %else2 +; NO_SVE-NEXT: str q0, [x0] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -885,6 +5949,57 @@ } define void @masked_gather_v4f64(<4 x double>* %a, <4 x double*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: fcmeq v1.2d, v1.2d, #0.0 +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; NO_SVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: xtn v0.4h, v0.4s +; NO_SVE-NEXT: umov w8, v0.h[1] +; NO_SVE-NEXT: umov w9, v0.h[2] +; NO_SVE-NEXT: umov w10, v0.h[0] +; NO_SVE-NEXT: umov w11, v0.h[3] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbz w10, #0, .LBB33_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB33_3 +; NO_SVE-NEXT: b .LBB33_4 +; NO_SVE-NEXT: .LBB33_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB33_4 +; NO_SVE-NEXT: .LBB33_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB33_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB33_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB33_7 +; NO_SVE-NEXT: b .LBB33_8 +; NO_SVE-NEXT: .LBB33_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB33_8 +; NO_SVE-NEXT: .LBB33_7: // %cond.load7 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x8] +; NO_SVE-NEXT: .LBB33_8: // %else8 +; NO_SVE-NEXT: stp q0, q1, [x0] 
+; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_gather_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -903,6 +6018,120 @@ } define void @masked_gather_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: fcmeq v1.2d, v1.2d, #0.0 +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, #0.0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB34_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB34_3 +; NO_SVE-NEXT: b .LBB34_4 +; NO_SVE-NEXT: .LBB34_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB34_4 +; NO_SVE-NEXT: .LBB34_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB34_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB34_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB34_7 +; NO_SVE-NEXT: b .LBB34_8 +; NO_SVE-NEXT: .LBB34_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB34_8 +; NO_SVE-NEXT: .LBB34_7: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB34_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB34_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB34_11 +; NO_SVE-NEXT: b .LBB34_12 +; NO_SVE-NEXT: .LBB34_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB34_12 +; NO_SVE-NEXT: .LBB34_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB34_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB34_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB34_15 +; NO_SVE-NEXT: b .LBB34_16 +; NO_SVE-NEXT: .LBB34_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB34_16 +; NO_SVE-NEXT: .LBB34_15: // %cond.load19 +; NO_SVE-NEXT: mov x8, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x8] +; NO_SVE-NEXT: .LBB34_16: // %else20 +; NO_SVE-NEXT: stp q0, q1, 
[x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: masked_gather_v8f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [z2.d] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [z3.d] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_gather_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -921,6 +6150,224 @@ } define void @masked_gather_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, #0.0 +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, #0.0 +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, #0.0 +; NO_SVE-NEXT: ldp q0, q1, [x0, #64] +; NO_SVE-NEXT: uzp1 v3.4s, v4.4s, v5.4s +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; NO_SVE-NEXT: ldp q6, q7, [x0, #96] +; NO_SVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NO_SVE-NEXT: fcmeq v1.2d, v1.2d, #0.0 +; NO_SVE-NEXT: fcmeq v5.2d, v6.2d, #0.0 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: fcmeq v4.2d, v7.2d, #0.0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: uzp1 v3.4s, v5.4s, v4.4s +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v2.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v2.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[0] +; NO_SVE-NEXT: umov w12, v0.b[1] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v0.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v0.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v0.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; 
NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB35_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB35_3 +; NO_SVE-NEXT: b .LBB35_4 +; NO_SVE-NEXT: .LBB35_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB35_4 +; NO_SVE-NEXT: .LBB35_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB35_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB35_7 +; NO_SVE-NEXT: b .LBB35_8 +; NO_SVE-NEXT: .LBB35_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB35_8 +; NO_SVE-NEXT: .LBB35_7: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB35_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB35_11 +; NO_SVE-NEXT: b .LBB35_12 +; NO_SVE-NEXT: .LBB35_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB35_12 +; NO_SVE-NEXT: .LBB35_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB35_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB35_15 +; NO_SVE-NEXT: b .LBB35_16 +; NO_SVE-NEXT: .LBB35_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB35_16 +; NO_SVE-NEXT: .LBB35_15: // %cond.load19 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_16: // %else20 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB35_18 +; NO_SVE-NEXT: // %bb.17: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB35_19 +; NO_SVE-NEXT: b .LBB35_20 +; NO_SVE-NEXT: .LBB35_18: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #9, .LBB35_20 +; NO_SVE-NEXT: .LBB35_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_20: // %else26 +; NO_SVE-NEXT: ldr q6, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB35_22 +; NO_SVE-NEXT: // %bb.21: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #11, .LBB35_23 +; NO_SVE-NEXT: b .LBB35_24 +; NO_SVE-NEXT: .LBB35_22: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #11, .LBB35_24 +; NO_SVE-NEXT: .LBB35_23: // %cond.load31 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_24: // %else32 +; NO_SVE-NEXT: ldr q7, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB35_26 +; NO_SVE-NEXT: // %bb.25: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB35_27 +; NO_SVE-NEXT: b .LBB35_28 +; NO_SVE-NEXT: .LBB35_26: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w8, #13, .LBB35_28 +; NO_SVE-NEXT: .LBB35_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.d }[1], [x9] +; NO_SVE-NEXT: .LBB35_28: // %else38 +; NO_SVE-NEXT: ldr q16, [x1, 
#112] +; NO_SVE-NEXT: tbz w8, #14, .LBB35_30 +; NO_SVE-NEXT: // %bb.29: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB35_31 +; NO_SVE-NEXT: b .LBB35_32 +; NO_SVE-NEXT: .LBB35_30: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w8, #15, .LBB35_32 +; NO_SVE-NEXT: .LBB35_31: // %cond.load43 +; NO_SVE-NEXT: mov x8, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.d }[1], [x8] +; NO_SVE-NEXT: .LBB35_32: // %else44 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: masked_gather_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.d, p0/z, z1.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p4.d, p0/z, z3.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p3/z, [z4.d] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [z6.d] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p4/z, [z7.d] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -939,6 +6386,430 @@ } define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #160] +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, #0.0 +; NO_SVE-NEXT: ldp q4, q5, [x0, #128] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, #0.0 +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, #0.0 +; NO_SVE-NEXT: ldp q0, q1, [x0, #224] +; NO_SVE-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; NO_SVE-NEXT: ldp q6, q7, [x0, #192] +; NO_SVE-NEXT: fcmeq v1.2d, v1.2d, #0.0 +; NO_SVE-NEXT: uzp1 v2.8h, v4.8h, v2.8h +; NO_SVE-NEXT: fcmeq v6.2d, v6.2d, #0.0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: ldp q3, q5, [x0, #32] +; NO_SVE-NEXT: fcmeq v7.2d, v7.2d, #0.0 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: ldp q16, q17, [x0] +; NO_SVE-NEXT: fcmeq v1.2d, v5.2d, #0.0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: fcmeq 
v6.2d, v16.2d, #0.0 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: uzp1 v0.8h, v5.8h, v0.8h +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: fcmeq v4.2d, v17.2d, #0.0 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: uzp1 v1.4s, v3.4s, v1.4s +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: uzp1 v3.4s, v6.4s, v4.4s +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w8, v0.b[0] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w10, v0.b[1] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: ldp q4, q5, [x0, #96] +; NO_SVE-NEXT: umov w12, v0.b[3] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v3.8h, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: and w9, w12, #0x1 +; NO_SVE-NEXT: umov w10, v0.b[5] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w13, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, #0.0 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #12 +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, #0.0 +; NO_SVE-NEXT: and w9, w10, #0x1 +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, #0.0 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; NO_SVE-NEXT: umov w13, v1.b[3] +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[5] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v4.8h +; NO_SVE-NEXT: bfi w11, w10, #1, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #2, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: bfi w11, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[7] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: bfi w11, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v0.b[6] +; NO_SVE-NEXT: bfi w11, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[0] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w9, w11, w13, lsl #6 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, 
lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB36_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB36_3 +; NO_SVE-NEXT: b .LBB36_4 +; NO_SVE-NEXT: .LBB36_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB36_4 +; NO_SVE-NEXT: .LBB36_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_4: // %else2 +; NO_SVE-NEXT: ldr q2, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB36_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB36_7 +; NO_SVE-NEXT: b .LBB36_8 +; NO_SVE-NEXT: .LBB36_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB36_8 +; NO_SVE-NEXT: .LBB36_7: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB36_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB36_11 +; NO_SVE-NEXT: b .LBB36_12 +; NO_SVE-NEXT: .LBB36_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB36_12 +; NO_SVE-NEXT: .LBB36_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB36_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB36_15 +; NO_SVE-NEXT: b .LBB36_16 +; NO_SVE-NEXT: .LBB36_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB36_16 +; NO_SVE-NEXT: .LBB36_15: // %cond.load19 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_16: // %else20 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB36_18 +; NO_SVE-NEXT: // %bb.17: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB36_19 +; NO_SVE-NEXT: b .LBB36_20 +; NO_SVE-NEXT: .LBB36_18: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #9, .LBB36_20 +; NO_SVE-NEXT: .LBB36_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_20: // %else26 +; NO_SVE-NEXT: ldr q6, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB36_22 +; NO_SVE-NEXT: // %bb.21: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #11, .LBB36_23 +; NO_SVE-NEXT: b .LBB36_24 +; NO_SVE-NEXT: .LBB36_22: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #11, .LBB36_24 +; NO_SVE-NEXT: .LBB36_23: // %cond.load31 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_24: // %else32 +; NO_SVE-NEXT: ldr q7, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB36_26 +; NO_SVE-NEXT: // %bb.25: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB36_27 +; NO_SVE-NEXT: b .LBB36_28 +; NO_SVE-NEXT: .LBB36_26: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w8, #13, .LBB36_28 +; NO_SVE-NEXT: .LBB36_27: // %cond.load37 +; NO_SVE-NEXT: 
mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_28: // %else38 +; NO_SVE-NEXT: ldr q16, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB36_30 +; NO_SVE-NEXT: // %bb.29: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB36_31 +; NO_SVE-NEXT: b .LBB36_32 +; NO_SVE-NEXT: .LBB36_30: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w8, #15, .LBB36_32 +; NO_SVE-NEXT: .LBB36_31: // %cond.load43 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_32: // %else44 +; NO_SVE-NEXT: ldr q17, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB36_34 +; NO_SVE-NEXT: // %bb.33: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB36_35 +; NO_SVE-NEXT: b .LBB36_36 +; NO_SVE-NEXT: .LBB36_34: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: tbz w8, #17, .LBB36_36 +; NO_SVE-NEXT: .LBB36_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v16.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_36: // %else50 +; NO_SVE-NEXT: ldr q18, [x1, #144] +; NO_SVE-NEXT: tbz w8, #18, .LBB36_38 +; NO_SVE-NEXT: // %bb.37: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d18 +; NO_SVE-NEXT: ld1 { v17.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #19, .LBB36_39 +; NO_SVE-NEXT: b .LBB36_40 +; NO_SVE-NEXT: .LBB36_38: +; NO_SVE-NEXT: // implicit-def: $q17 +; NO_SVE-NEXT: tbz w8, #19, .LBB36_40 +; NO_SVE-NEXT: .LBB36_39: // %cond.load55 +; NO_SVE-NEXT: mov x9, v18.d[1] +; NO_SVE-NEXT: ld1 { v17.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_40: // %else56 +; NO_SVE-NEXT: ldr q19, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB36_42 +; NO_SVE-NEXT: // %bb.41: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d19 +; NO_SVE-NEXT: ld1 { v18.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #21, .LBB36_43 +; NO_SVE-NEXT: b .LBB36_44 +; NO_SVE-NEXT: .LBB36_42: +; NO_SVE-NEXT: // implicit-def: $q18 +; NO_SVE-NEXT: tbz w8, #21, .LBB36_44 +; NO_SVE-NEXT: .LBB36_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v19.d[1] +; NO_SVE-NEXT: ld1 { v18.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_44: // %else62 +; NO_SVE-NEXT: ldr q20, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB36_46 +; NO_SVE-NEXT: // %bb.45: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d20 +; NO_SVE-NEXT: ld1 { v19.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #23, .LBB36_47 +; NO_SVE-NEXT: b .LBB36_48 +; NO_SVE-NEXT: .LBB36_46: +; NO_SVE-NEXT: // implicit-def: $q19 +; NO_SVE-NEXT: tbz w8, #23, .LBB36_48 +; NO_SVE-NEXT: .LBB36_47: // %cond.load67 +; NO_SVE-NEXT: mov x9, v20.d[1] +; NO_SVE-NEXT: ld1 { v19.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_48: // %else68 +; NO_SVE-NEXT: ldr q21, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB36_50 +; NO_SVE-NEXT: // %bb.49: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d21 +; NO_SVE-NEXT: ld1 { v20.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB36_51 +; NO_SVE-NEXT: b .LBB36_52 +; NO_SVE-NEXT: .LBB36_50: +; NO_SVE-NEXT: // implicit-def: $q20 +; NO_SVE-NEXT: tbz w8, #25, .LBB36_52 +; NO_SVE-NEXT: .LBB36_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v21.d[1] +; NO_SVE-NEXT: ld1 { v20.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_52: // %else74 +; NO_SVE-NEXT: ldr q22, [x1, #208] +; NO_SVE-NEXT: tbz w8, #26, .LBB36_54 +; NO_SVE-NEXT: // %bb.53: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d22 +; NO_SVE-NEXT: ld1 { v21.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #27, .LBB36_55 +; NO_SVE-NEXT: b .LBB36_56 +; NO_SVE-NEXT: .LBB36_54: +; NO_SVE-NEXT: // implicit-def: $q21 +; NO_SVE-NEXT: tbz w8, #27, .LBB36_56 +; NO_SVE-NEXT: .LBB36_55: // 
%cond.load79 +; NO_SVE-NEXT: mov x9, v22.d[1] +; NO_SVE-NEXT: ld1 { v21.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_56: // %else80 +; NO_SVE-NEXT: ldr q23, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB36_58 +; NO_SVE-NEXT: // %bb.57: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d23 +; NO_SVE-NEXT: ld1 { v22.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #29, .LBB36_59 +; NO_SVE-NEXT: b .LBB36_60 +; NO_SVE-NEXT: .LBB36_58: +; NO_SVE-NEXT: // implicit-def: $q22 +; NO_SVE-NEXT: tbz w8, #29, .LBB36_60 +; NO_SVE-NEXT: .LBB36_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v23.d[1] +; NO_SVE-NEXT: ld1 { v22.d }[1], [x9] +; NO_SVE-NEXT: .LBB36_60: // %else86 +; NO_SVE-NEXT: ldr q24, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB36_62 +; NO_SVE-NEXT: // %bb.61: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d24 +; NO_SVE-NEXT: ld1 { v23.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB36_63 +; NO_SVE-NEXT: b .LBB36_64 +; NO_SVE-NEXT: .LBB36_62: +; NO_SVE-NEXT: // implicit-def: $q23 +; NO_SVE-NEXT: tbz w8, #31, .LBB36_64 +; NO_SVE-NEXT: .LBB36_63: // %cond.load91 +; NO_SVE-NEXT: mov x8, v24.d[1] +; NO_SVE-NEXT: ld1 { v23.d }[1], [x8] +; NO_SVE-NEXT: .LBB36_64: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: stp q16, q17, [x0, #128] +; NO_SVE-NEXT: stp q18, q19, [x0, #160] +; NO_SVE-NEXT: stp q20, q21, [x0, #192] +; NO_SVE-NEXT: stp q22, q23, [x0, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: masked_gather_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #12 +; VBITS_EQ_256-NEXT: mov x11, #16 +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: mov x13, #24 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z19.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z21.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z0.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p0/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p0/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p0/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p2/z, [z19.d] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z6.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [z21.d] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p2/z, [z22.d] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z3.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p2/z, [z20.d] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z4.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p2/z, [z18.d] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z5.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p2/z, [z17.d] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [z16.d] +; VBITS_EQ_256-NEXT: 
fcmeq p1.d, p0/z, z7.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [z23.d] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -960,18 +6831,436 @@ ; modes still function define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f16: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: masked_gather_32b_scaled_sext_f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, #0.0 +; NO_SVE-NEXT: ldr q2, [x1] +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v1.b[0] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v3.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v3.b[3] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w12, v3.b[4] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v3.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: fcmeq v0.8h, v4.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w15, 
#1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v3.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v1.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: sshll v0.2d, v2.2s, #1 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v4.2d, v1.2d, v0.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB37_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: b .LBB37_3 +; NO_SVE-NEXT: .LBB37_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: .LBB37_3: // %else +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: sshll2 v2.2d, v2.4s, #1 +; NO_SVE-NEXT: tbnz w8, #1, .LBB37_13 +; NO_SVE-NEXT: // %bb.4: // %else2 +; NO_SVE-NEXT: add v2.2d, v1.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #2, .LBB37_14 +; NO_SVE-NEXT: .LBB37_5: // %else5 +; NO_SVE-NEXT: sshll v4.2d, v3.2s, #1 +; NO_SVE-NEXT: tbnz w8, #3, .LBB37_15 +; NO_SVE-NEXT: .LBB37_6: // %else8 +; NO_SVE-NEXT: add v2.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB37_8 +; NO_SVE-NEXT: .LBB37_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: .LBB37_8: // %else11 +; NO_SVE-NEXT: ldr q4, [x1, #32] +; NO_SVE-NEXT: sshll2 v3.2d, v3.4s, #1 +; NO_SVE-NEXT: tbnz w8, #5, .LBB37_16 +; NO_SVE-NEXT: // %bb.9: // %else14 +; NO_SVE-NEXT: add v2.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #6, .LBB37_17 +; NO_SVE-NEXT: .LBB37_10: // %else17 +; NO_SVE-NEXT: sshll v3.2d, v4.2s, #1 +; NO_SVE-NEXT: tbnz w8, #7, .LBB37_18 +; NO_SVE-NEXT: .LBB37_11: // %else20 +; NO_SVE-NEXT: add v5.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB37_19 +; NO_SVE-NEXT: .LBB37_12: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: b .LBB37_20 +; NO_SVE-NEXT: .LBB37_13: // %cond.load1 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: add v2.2d, v1.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #2, .LBB37_5 +; NO_SVE-NEXT: .LBB37_14: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; 
NO_SVE-NEXT: sshll v4.2d, v3.2s, #1 +; NO_SVE-NEXT: tbz w8, #3, .LBB37_6 +; NO_SVE-NEXT: .LBB37_15: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: add v2.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #4, .LBB37_7 +; NO_SVE-NEXT: b .LBB37_8 +; NO_SVE-NEXT: .LBB37_16: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: add v2.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #6, .LBB37_10 +; NO_SVE-NEXT: .LBB37_17: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: sshll v3.2d, v4.2s, #1 +; NO_SVE-NEXT: tbz w8, #7, .LBB37_11 +; NO_SVE-NEXT: .LBB37_18: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: add v5.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #8, .LBB37_12 +; NO_SVE-NEXT: .LBB37_19: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: .LBB37_20: // %else23 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: sshll2 v4.2d, v4.4s, #1 +; NO_SVE-NEXT: tbnz w8, #9, .LBB37_30 +; NO_SVE-NEXT: // %bb.21: // %else26 +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #10, .LBB37_31 +; NO_SVE-NEXT: .LBB37_22: // %else29 +; NO_SVE-NEXT: sshll v5.2d, v3.2s, #1 +; NO_SVE-NEXT: tbnz w8, #11, .LBB37_32 +; NO_SVE-NEXT: .LBB37_23: // %else32 +; NO_SVE-NEXT: add v4.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB37_25 +; NO_SVE-NEXT: .LBB37_24: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: .LBB37_25: // %else35 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: sshll2 v3.2d, v3.4s, #1 +; NO_SVE-NEXT: tbnz w8, #13, .LBB37_33 +; NO_SVE-NEXT: // %bb.26: // %else38 +; NO_SVE-NEXT: add v3.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #14, .LBB37_34 +; NO_SVE-NEXT: .LBB37_27: // %else41 +; NO_SVE-NEXT: sshll v4.2d, v5.2s, #1 +; NO_SVE-NEXT: tbnz w8, #15, .LBB37_35 +; NO_SVE-NEXT: .LBB37_28: // %else44 +; NO_SVE-NEXT: add v6.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB37_36 +; NO_SVE-NEXT: .LBB37_29: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: b .LBB37_37 +; NO_SVE-NEXT: .LBB37_30: // %cond.load25 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #10, .LBB37_22 +; NO_SVE-NEXT: .LBB37_31: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: sshll v5.2d, v3.2s, #1 +; NO_SVE-NEXT: tbz w8, #11, .LBB37_23 +; NO_SVE-NEXT: .LBB37_32: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #12, .LBB37_24 +; NO_SVE-NEXT: b .LBB37_25 +; NO_SVE-NEXT: .LBB37_33: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: add v3.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #14, .LBB37_27 +; NO_SVE-NEXT: .LBB37_34: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: sshll v4.2d, v5.2s, #1 +; NO_SVE-NEXT: tbz w8, #15, .LBB37_28 +; NO_SVE-NEXT: .LBB37_35: // %cond.load43 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: add v6.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #16, .LBB37_29 +; NO_SVE-NEXT: .LBB37_36: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: .LBB37_37: // %else47 +; NO_SVE-NEXT: ldr q4, [x1, #80] +; NO_SVE-NEXT: sshll2 v5.2d, v5.4s, #1 +; NO_SVE-NEXT: tbnz w8, #17, 
.LBB37_47 +; NO_SVE-NEXT: // %bb.38: // %else50 +; NO_SVE-NEXT: add v5.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #18, .LBB37_48 +; NO_SVE-NEXT: .LBB37_39: // %else53 +; NO_SVE-NEXT: sshll v6.2d, v4.2s, #1 +; NO_SVE-NEXT: tbnz w8, #19, .LBB37_49 +; NO_SVE-NEXT: .LBB37_40: // %else56 +; NO_SVE-NEXT: add v5.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB37_42 +; NO_SVE-NEXT: .LBB37_41: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: .LBB37_42: // %else59 +; NO_SVE-NEXT: ldr q6, [x1, #96] +; NO_SVE-NEXT: sshll2 v4.2d, v4.4s, #1 +; NO_SVE-NEXT: tbnz w8, #21, .LBB37_50 +; NO_SVE-NEXT: // %bb.43: // %else62 +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #22, .LBB37_51 +; NO_SVE-NEXT: .LBB37_44: // %else65 +; NO_SVE-NEXT: sshll v5.2d, v6.2s, #1 +; NO_SVE-NEXT: tbnz w8, #23, .LBB37_52 +; NO_SVE-NEXT: .LBB37_45: // %else68 +; NO_SVE-NEXT: add v7.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB37_53 +; NO_SVE-NEXT: .LBB37_46: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v4.h }[0], [x9] +; NO_SVE-NEXT: b .LBB37_54 +; NO_SVE-NEXT: .LBB37_47: // %cond.load49 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: add v5.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #18, .LBB37_39 +; NO_SVE-NEXT: .LBB37_48: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: sshll v6.2d, v4.2s, #1 +; NO_SVE-NEXT: tbz w8, #19, .LBB37_40 +; NO_SVE-NEXT: .LBB37_49: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: add v5.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #20, .LBB37_41 +; NO_SVE-NEXT: b .LBB37_42 +; NO_SVE-NEXT: .LBB37_50: // %cond.load61 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #22, .LBB37_44 +; NO_SVE-NEXT: .LBB37_51: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: sshll v5.2d, v6.2s, #1 +; NO_SVE-NEXT: tbz w8, #23, .LBB37_45 +; NO_SVE-NEXT: .LBB37_52: // %cond.load67 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[7], [x9] +; NO_SVE-NEXT: add v7.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #24, .LBB37_46 +; NO_SVE-NEXT: .LBB37_53: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: .LBB37_54: // %else71 +; NO_SVE-NEXT: ldr q5, [x1, #112] +; NO_SVE-NEXT: sshll2 v6.2d, v6.4s, #1 +; NO_SVE-NEXT: tbnz w8, #25, .LBB37_63 +; NO_SVE-NEXT: // %bb.55: // %else74 +; NO_SVE-NEXT: add v6.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #26, .LBB37_64 +; NO_SVE-NEXT: .LBB37_56: // %else77 +; NO_SVE-NEXT: sshll v7.2d, v5.2s, #1 +; NO_SVE-NEXT: tbnz w8, #27, .LBB37_65 +; NO_SVE-NEXT: .LBB37_57: // %else80 +; NO_SVE-NEXT: add v6.2d, v1.2d, v7.2d +; NO_SVE-NEXT: tbnz w8, #28, .LBB37_66 +; NO_SVE-NEXT: .LBB37_58: // %else83 +; NO_SVE-NEXT: sshll2 v5.2d, v5.4s, #1 +; NO_SVE-NEXT: tbnz w8, #29, .LBB37_67 +; NO_SVE-NEXT: .LBB37_59: // %else86 +; NO_SVE-NEXT: add v1.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB37_68 +; NO_SVE-NEXT: .LBB37_60: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB37_62 +; NO_SVE-NEXT: .LBB37_61: // %cond.load91 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[7], [x8] +; NO_SVE-NEXT: .LBB37_62: // %else92 +; NO_SVE-NEXT: stp q0, q2, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB37_63: // %cond.load73 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[1], [x9] 
+; NO_SVE-NEXT: add v6.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #26, .LBB37_56 +; NO_SVE-NEXT: .LBB37_64: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[2], [x9] +; NO_SVE-NEXT: sshll v7.2d, v5.2s, #1 +; NO_SVE-NEXT: tbz w8, #27, .LBB37_57 +; NO_SVE-NEXT: .LBB37_65: // %cond.load79 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[3], [x9] +; NO_SVE-NEXT: add v6.2d, v1.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB37_58 +; NO_SVE-NEXT: .LBB37_66: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[4], [x9] +; NO_SVE-NEXT: sshll2 v5.2d, v5.4s, #1 +; NO_SVE-NEXT: tbz w8, #29, .LBB37_59 +; NO_SVE-NEXT: .LBB37_67: // %cond.load85 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[5], [x9] +; NO_SVE-NEXT: add v1.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #30, .LBB37_60 +; NO_SVE-NEXT: .LBB37_68: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v4.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB37_61 +; NO_SVE-NEXT: b .LBB37_62 +; +; VBITS_EQ_256-LABEL: masked_gather_32b_scaled_sext_f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p1.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_EQ_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, sxtw #1] +; VBITS_EQ_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, sxtw #1] +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_EQ_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, sxtw #1] +; VBITS_EQ_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, sxtw #1] +; VBITS_EQ_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_sext_f16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals 
= load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -983,15 +7272,438 @@ } define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f32: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 -; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2] -; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: masked_gather_32b_scaled_sext_f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldp q0, q2, [x0] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v3.8h, v4.8h +; NO_SVE-NEXT: ldp q5, q6, [x0, #96] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NO_SVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #48] +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: uzp1 v5.8h, v5.8h, v6.8h +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v5.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v5.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v5.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w13, v5.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v3.4s, v4.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v4.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[6] +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: 
umov w11, v0.b[7] +; NO_SVE-NEXT: xtn v0.8b, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v5.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v5.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v2.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: sshll v0.2d, v1.2s, #2 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v4.2d, v2.2d, v0.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB38_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: b .LBB38_3 +; NO_SVE-NEXT: .LBB38_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: .LBB38_3: // %else +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: sshll2 v1.2d, v1.4s, #2 +; NO_SVE-NEXT: tbnz w8, #1, .LBB38_8 +; NO_SVE-NEXT: // %bb.4: // %else2 +; NO_SVE-NEXT: add v1.2d, v2.2d, v1.2d +; NO_SVE-NEXT: tbnz w8, #2, .LBB38_9 +; NO_SVE-NEXT: .LBB38_5: // %else5 +; NO_SVE-NEXT: sshll v4.2d, v3.2s, #2 +; NO_SVE-NEXT: tbnz w8, #3, .LBB38_10 +; NO_SVE-NEXT: .LBB38_6: // %else8 +; NO_SVE-NEXT: add v5.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB38_11 +; NO_SVE-NEXT: .LBB38_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: b .LBB38_12 +; NO_SVE-NEXT: .LBB38_8: // %cond.load1 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: add v1.2d, v2.2d, v1.2d +; NO_SVE-NEXT: tbz w8, #2, .LBB38_5 +; NO_SVE-NEXT: .LBB38_9: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: sshll v4.2d, v3.2s, #2 +; NO_SVE-NEXT: tbz w8, #3, .LBB38_6 +; NO_SVE-NEXT: .LBB38_10: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: add v5.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #4, .LBB38_7 +; NO_SVE-NEXT: .LBB38_11: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: .LBB38_12: // %else11 +; NO_SVE-NEXT: ldr q4, [x1, #32] +; NO_SVE-NEXT: sshll2 v3.2d, v3.4s, #2 +; NO_SVE-NEXT: tbnz w8, #5, .LBB38_17 +; NO_SVE-NEXT: // %bb.13: // %else14 +; NO_SVE-NEXT: add v3.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #6, .LBB38_18 +; NO_SVE-NEXT: .LBB38_14: // %else17 +; NO_SVE-NEXT: sshll v5.2d, v4.2s, #2 +; NO_SVE-NEXT: tbnz w8, #7, .LBB38_19 +; NO_SVE-NEXT: .LBB38_15: // %else20 +; NO_SVE-NEXT: add v6.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB38_20 +; NO_SVE-NEXT: .LBB38_16: // %cond.load22 +; 
NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: b .LBB38_21 +; NO_SVE-NEXT: .LBB38_17: // %cond.load13 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: add v3.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #6, .LBB38_14 +; NO_SVE-NEXT: .LBB38_18: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: sshll v5.2d, v4.2s, #2 +; NO_SVE-NEXT: tbz w8, #7, .LBB38_15 +; NO_SVE-NEXT: .LBB38_19: // %cond.load19 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: add v6.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #8, .LBB38_16 +; NO_SVE-NEXT: .LBB38_20: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: .LBB38_21: // %else23 +; NO_SVE-NEXT: ldr q5, [x1, #48] +; NO_SVE-NEXT: sshll2 v4.2d, v4.4s, #2 +; NO_SVE-NEXT: tbnz w8, #9, .LBB38_26 +; NO_SVE-NEXT: // %bb.22: // %else26 +; NO_SVE-NEXT: add v4.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #10, .LBB38_27 +; NO_SVE-NEXT: .LBB38_23: // %else29 +; NO_SVE-NEXT: sshll v6.2d, v5.2s, #2 +; NO_SVE-NEXT: tbnz w8, #11, .LBB38_28 +; NO_SVE-NEXT: .LBB38_24: // %else32 +; NO_SVE-NEXT: add v7.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB38_29 +; NO_SVE-NEXT: .LBB38_25: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: b .LBB38_30 +; NO_SVE-NEXT: .LBB38_26: // %cond.load25 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: add v4.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #10, .LBB38_23 +; NO_SVE-NEXT: .LBB38_27: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: sshll v6.2d, v5.2s, #2 +; NO_SVE-NEXT: tbz w8, #11, .LBB38_24 +; NO_SVE-NEXT: .LBB38_28: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: add v7.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #12, .LBB38_25 +; NO_SVE-NEXT: .LBB38_29: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: .LBB38_30: // %else35 +; NO_SVE-NEXT: ldr q6, [x1, #64] +; NO_SVE-NEXT: sshll2 v5.2d, v5.4s, #2 +; NO_SVE-NEXT: tbnz w8, #13, .LBB38_35 +; NO_SVE-NEXT: // %bb.31: // %else38 +; NO_SVE-NEXT: add v5.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #14, .LBB38_36 +; NO_SVE-NEXT: .LBB38_32: // %else41 +; NO_SVE-NEXT: sshll v7.2d, v6.2s, #2 +; NO_SVE-NEXT: tbnz w8, #15, .LBB38_37 +; NO_SVE-NEXT: .LBB38_33: // %else44 +; NO_SVE-NEXT: add v16.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB38_38 +; NO_SVE-NEXT: .LBB38_34: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: b .LBB38_39 +; NO_SVE-NEXT: .LBB38_35: // %cond.load37 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: add v5.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #14, .LBB38_32 +; NO_SVE-NEXT: .LBB38_36: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: sshll v7.2d, v6.2s, #2 +; NO_SVE-NEXT: tbz w8, #15, .LBB38_33 +; NO_SVE-NEXT: .LBB38_37: // %cond.load43 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: add v16.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbnz w8, #16, .LBB38_34 +; NO_SVE-NEXT: .LBB38_38: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: .LBB38_39: // %else47 +; NO_SVE-NEXT: ldr q7, [x1, #80] +; NO_SVE-NEXT: sshll2 v6.2d, v6.4s, #2 +; NO_SVE-NEXT: tbnz w8, #17, .LBB38_44 +; NO_SVE-NEXT: // %bb.40: // %else50 +; NO_SVE-NEXT: add v6.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #18, .LBB38_45 +; 
NO_SVE-NEXT: .LBB38_41: // %else53 +; NO_SVE-NEXT: sshll v16.2d, v7.2s, #2 +; NO_SVE-NEXT: tbnz w8, #19, .LBB38_46 +; NO_SVE-NEXT: .LBB38_42: // %else56 +; NO_SVE-NEXT: add v17.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB38_47 +; NO_SVE-NEXT: .LBB38_43: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: b .LBB38_48 +; NO_SVE-NEXT: .LBB38_44: // %cond.load49 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: add v6.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #18, .LBB38_41 +; NO_SVE-NEXT: .LBB38_45: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: sshll v16.2d, v7.2s, #2 +; NO_SVE-NEXT: tbz w8, #19, .LBB38_42 +; NO_SVE-NEXT: .LBB38_46: // %cond.load55 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: add v17.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbnz w8, #20, .LBB38_43 +; NO_SVE-NEXT: .LBB38_47: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: .LBB38_48: // %else59 +; NO_SVE-NEXT: ldr q16, [x1, #96] +; NO_SVE-NEXT: sshll2 v7.2d, v7.4s, #2 +; NO_SVE-NEXT: tbnz w8, #21, .LBB38_53 +; NO_SVE-NEXT: // %bb.49: // %else62 +; NO_SVE-NEXT: add v7.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbnz w8, #22, .LBB38_54 +; NO_SVE-NEXT: .LBB38_50: // %else65 +; NO_SVE-NEXT: sshll v17.2d, v16.2s, #2 +; NO_SVE-NEXT: tbnz w8, #23, .LBB38_55 +; NO_SVE-NEXT: .LBB38_51: // %else68 +; NO_SVE-NEXT: add v18.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB38_56 +; NO_SVE-NEXT: .LBB38_52: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d18 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: b .LBB38_57 +; NO_SVE-NEXT: .LBB38_53: // %cond.load61 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: add v7.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #22, .LBB38_50 +; NO_SVE-NEXT: .LBB38_54: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: sshll v17.2d, v16.2s, #2 +; NO_SVE-NEXT: tbz w8, #23, .LBB38_51 +; NO_SVE-NEXT: .LBB38_55: // %cond.load67 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: add v18.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbnz w8, #24, .LBB38_52 +; NO_SVE-NEXT: .LBB38_56: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: .LBB38_57: // %else71 +; NO_SVE-NEXT: ldr q17, [x1, #112] +; NO_SVE-NEXT: sshll2 v16.2d, v16.4s, #2 +; NO_SVE-NEXT: tbnz w8, #25, .LBB38_62 +; NO_SVE-NEXT: // %bb.58: // %else74 +; NO_SVE-NEXT: add v16.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbnz w8, #26, .LBB38_63 +; NO_SVE-NEXT: .LBB38_59: // %else77 +; NO_SVE-NEXT: sshll v18.2d, v17.2s, #2 +; NO_SVE-NEXT: tbnz w8, #27, .LBB38_64 +; NO_SVE-NEXT: .LBB38_60: // %else80 +; NO_SVE-NEXT: add v18.2d, v2.2d, v18.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB38_65 +; NO_SVE-NEXT: .LBB38_61: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d18 +; NO_SVE-NEXT: ld1 { v16.s }[0], [x9] +; NO_SVE-NEXT: sshll2 v17.2d, v17.4s, #2 +; NO_SVE-NEXT: tbnz w8, #29, .LBB38_66 +; NO_SVE-NEXT: b .LBB38_67 +; NO_SVE-NEXT: .LBB38_62: // %cond.load73 +; NO_SVE-NEXT: mov x9, v18.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: add v16.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbz w8, #26, .LBB38_59 +; NO_SVE-NEXT: .LBB38_63: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: sshll v18.2d, v17.2s, #2 +; NO_SVE-NEXT: tbz w8, #27, .LBB38_60 +; NO_SVE-NEXT: .LBB38_64: // %cond.load79 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: add 
v18.2d, v2.2d, v18.2d +; NO_SVE-NEXT: tbnz w8, #28, .LBB38_61 +; NO_SVE-NEXT: .LBB38_65: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: sshll2 v17.2d, v17.4s, #2 +; NO_SVE-NEXT: tbz w8, #29, .LBB38_67 +; NO_SVE-NEXT: .LBB38_66: // %cond.load85 +; NO_SVE-NEXT: mov x9, v18.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[1], [x9] +; NO_SVE-NEXT: .LBB38_67: // %else86 +; NO_SVE-NEXT: add v2.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB38_71 +; NO_SVE-NEXT: // %bb.68: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB38_70 +; NO_SVE-NEXT: .LBB38_69: // %cond.load91 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[3], [x8] +; NO_SVE-NEXT: .LBB38_70: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: stp q5, q6, [x0, #64] +; NO_SVE-NEXT: stp q7, q16, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB38_71: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v16.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB38_69 +; NO_SVE-NEXT: b .LBB38_70 +; +; VBITS_EQ_256-LABEL: masked_gather_32b_scaled_sext_f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p3/z, [x2, z4.s, sxtw #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p2/z, [x2, z6.s, sxtw #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p1/z, [x2, z5.s, sxtw #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p4/z, [x2, z7.s, sxtw #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_sext_f32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x float>, <32 x float>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -1003,6 +7715,472 @@ } define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 { +; NO_SVE-LABEL: masked_gather_32b_scaled_sext_f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q4, [x0, #160] +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: ldp q5, q6, [x0, #128] +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, #0.0 +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, #0.0 +; NO_SVE-NEXT: uzp1 v3.4s, v3.4s, v4.4s +; NO_SVE-NEXT: ldp q0, q2, [x0, #224] +; NO_SVE-NEXT: 
fcmeq v6.2d, v6.2d, #0.0 +; NO_SVE-NEXT: fcmeq v0.2d, v0.2d, #0.0 +; NO_SVE-NEXT: uzp1 v5.4s, v5.4s, v6.4s +; NO_SVE-NEXT: ldp q7, q16, [x0, #192] +; NO_SVE-NEXT: fcmeq v2.2d, v2.2d, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v5.8h, v3.8h +; NO_SVE-NEXT: fcmeq v7.2d, v7.2d, #0.0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v2.4s +; NO_SVE-NEXT: fcmeq v16.2d, v16.2d, #0.0 +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: ldp q17, q18, [x0] +; NO_SVE-NEXT: uzp1 v2.4s, v7.4s, v16.4s +; NO_SVE-NEXT: fcmeq v7.2d, v17.2d, #0.0 +; NO_SVE-NEXT: ldp q19, q4, [x0, #32] +; NO_SVE-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: fcmeq v6.2d, v18.2d, #0.0 +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, #0.0 +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: fcmeq v5.2d, v19.2d, #0.0 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: uzp1 v3.4s, v5.4s, v4.4s +; NO_SVE-NEXT: umov w8, v0.b[0] +; NO_SVE-NEXT: uzp1 v4.4s, v7.4s, v6.4s +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w10, v0.b[1] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: ldp q5, q6, [x0, #96] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: umov w13, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #7 +; NO_SVE-NEXT: uzp1 v2.8h, v4.8h, v3.8h +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[4] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #10 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w9, v0.b[5] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #11 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: fcmeq v6.2d, v6.2d, #0.0 +; NO_SVE-NEXT: umov w12, v2.b[2] +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, #0.0 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, #0.0 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: fcmeq v3.2d, v3.2d, #0.0 +; NO_SVE-NEXT: umov w10, v2.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w11, v2.b[0] +; NO_SVE-NEXT: umov w13, v2.b[3] +; NO_SVE-NEXT: uzp1 v5.4s, v5.4s, v6.4s +; NO_SVE-NEXT: umov w14, v2.b[4] +; NO_SVE-NEXT: uzp1 v3.4s, v3.4s, v4.4s +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: uzp1 v3.8h, v3.8h, v5.8h +; NO_SVE-NEXT: bfi w11, w10, #1, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #2, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: dup v5.2d, x2 +; NO_SVE-NEXT: bfi w11, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v2.b[7] +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: bfi w11, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v0.b[6] +; NO_SVE-NEXT: bfi w11, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; 
NO_SVE-NEXT: umov w14, v2.b[0] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[1] +; NO_SVE-NEXT: orr w9, w11, w13, lsl #6 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[2] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v2.b[4] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: sshll v0.2d, v1.2s, #3 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v2.2d, v5.2d, v0.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB39_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ldr d0, [x9] +; NO_SVE-NEXT: b .LBB39_3 +; NO_SVE-NEXT: .LBB39_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: .LBB39_3: // %else +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: sshll2 v1.2d, v1.4s, #3 +; NO_SVE-NEXT: tbz w8, #1, .LBB39_5 +; NO_SVE-NEXT: // %bb.4: // %cond.load1 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_5: // %else2 +; NO_SVE-NEXT: add v2.2d, v5.2d, v1.2d +; NO_SVE-NEXT: tbz w8, #2, .LBB39_7 +; NO_SVE-NEXT: // %bb.6: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: sshll v4.2d, v3.2s, #3 +; NO_SVE-NEXT: tbnz w8, #3, .LBB39_8 +; NO_SVE-NEXT: b .LBB39_9 +; NO_SVE-NEXT: .LBB39_7: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: sshll v4.2d, v3.2s, #3 +; NO_SVE-NEXT: tbz w8, #3, .LBB39_9 +; NO_SVE-NEXT: .LBB39_8: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_9: // %else8 +; NO_SVE-NEXT: add v4.2d, v5.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB39_11 +; NO_SVE-NEXT: // %bb.10: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: b .LBB39_12 +; NO_SVE-NEXT: .LBB39_11: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: .LBB39_12: // %else11 +; NO_SVE-NEXT: ldr q6, [x1, #32] +; NO_SVE-NEXT: sshll2 v3.2d, v3.4s, #3 +; NO_SVE-NEXT: tbz w8, #5, .LBB39_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load13 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_14: // %else14 +; NO_SVE-NEXT: add v4.2d, v5.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #6, .LBB39_16 +; NO_SVE-NEXT: // %bb.15: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: sshll v7.2d, v6.2s, #3 +; NO_SVE-NEXT: tbnz w8, #7, .LBB39_17 +; NO_SVE-NEXT: b .LBB39_18 +; NO_SVE-NEXT: .LBB39_16: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: sshll v7.2d, v6.2s, #3 +; NO_SVE-NEXT: tbz w8, #7, .LBB39_18 +; NO_SVE-NEXT: .LBB39_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_18: // %else20 +; NO_SVE-NEXT: add v7.2d, v5.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #8, 
.LBB39_20 +; NO_SVE-NEXT: // %bb.19: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v4.d }[0], [x9] +; NO_SVE-NEXT: b .LBB39_21 +; NO_SVE-NEXT: .LBB39_20: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: .LBB39_21: // %else23 +; NO_SVE-NEXT: ldr q16, [x1, #48] +; NO_SVE-NEXT: sshll2 v6.2d, v6.4s, #3 +; NO_SVE-NEXT: tbz w8, #9, .LBB39_23 +; NO_SVE-NEXT: // %bb.22: // %cond.load25 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v4.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_23: // %else26 +; NO_SVE-NEXT: add v7.2d, v5.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #10, .LBB39_25 +; NO_SVE-NEXT: // %bb.24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.d }[0], [x9] +; NO_SVE-NEXT: sshll v17.2d, v16.2s, #3 +; NO_SVE-NEXT: tbnz w8, #11, .LBB39_26 +; NO_SVE-NEXT: b .LBB39_27 +; NO_SVE-NEXT: .LBB39_25: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: sshll v17.2d, v16.2s, #3 +; NO_SVE-NEXT: tbz w8, #11, .LBB39_27 +; NO_SVE-NEXT: .LBB39_26: // %cond.load31 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_27: // %else32 +; NO_SVE-NEXT: add v17.2d, v5.2d, v17.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB39_29 +; NO_SVE-NEXT: // %bb.28: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v7.d }[0], [x9] +; NO_SVE-NEXT: b .LBB39_30 +; NO_SVE-NEXT: .LBB39_29: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: .LBB39_30: // %else35 +; NO_SVE-NEXT: ldr q18, [x1, #64] +; NO_SVE-NEXT: sshll2 v16.2d, v16.4s, #3 +; NO_SVE-NEXT: tbz w8, #13, .LBB39_32 +; NO_SVE-NEXT: // %bb.31: // %cond.load37 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v7.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_32: // %else38 +; NO_SVE-NEXT: add v17.2d, v5.2d, v16.2d +; NO_SVE-NEXT: tbz w8, #14, .LBB39_34 +; NO_SVE-NEXT: // %bb.33: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.d }[0], [x9] +; NO_SVE-NEXT: sshll v19.2d, v18.2s, #3 +; NO_SVE-NEXT: tbnz w8, #15, .LBB39_35 +; NO_SVE-NEXT: b .LBB39_36 +; NO_SVE-NEXT: .LBB39_34: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: sshll v19.2d, v18.2s, #3 +; NO_SVE-NEXT: tbz w8, #15, .LBB39_36 +; NO_SVE-NEXT: .LBB39_35: // %cond.load43 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v16.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_36: // %else44 +; NO_SVE-NEXT: add v19.2d, v5.2d, v19.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB39_38 +; NO_SVE-NEXT: // %bb.37: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d19 +; NO_SVE-NEXT: ld1 { v17.d }[0], [x9] +; NO_SVE-NEXT: b .LBB39_39 +; NO_SVE-NEXT: .LBB39_38: +; NO_SVE-NEXT: // implicit-def: $q17 +; NO_SVE-NEXT: .LBB39_39: // %else47 +; NO_SVE-NEXT: ldr q20, [x1, #80] +; NO_SVE-NEXT: sshll2 v18.2d, v18.4s, #3 +; NO_SVE-NEXT: tbz w8, #17, .LBB39_41 +; NO_SVE-NEXT: // %bb.40: // %cond.load49 +; NO_SVE-NEXT: mov x9, v19.d[1] +; NO_SVE-NEXT: ld1 { v17.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_41: // %else50 +; NO_SVE-NEXT: add v19.2d, v5.2d, v18.2d +; NO_SVE-NEXT: tbz w8, #18, .LBB39_43 +; NO_SVE-NEXT: // %bb.42: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d19 +; NO_SVE-NEXT: ld1 { v18.d }[0], [x9] +; NO_SVE-NEXT: sshll v21.2d, v20.2s, #3 +; NO_SVE-NEXT: tbnz w8, #19, .LBB39_44 +; NO_SVE-NEXT: b .LBB39_45 +; NO_SVE-NEXT: .LBB39_43: +; NO_SVE-NEXT: // implicit-def: $q18 +; NO_SVE-NEXT: sshll v21.2d, v20.2s, #3 +; NO_SVE-NEXT: tbz w8, #19, .LBB39_45 +; NO_SVE-NEXT: .LBB39_44: // %cond.load55 +; NO_SVE-NEXT: mov x9, v19.d[1] +; NO_SVE-NEXT: ld1 { v18.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_45: // %else56 +; NO_SVE-NEXT: add v21.2d, v5.2d, v21.2d +; NO_SVE-NEXT: tbz w8, #20, 
.LBB39_47 +; NO_SVE-NEXT: // %bb.46: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d21 +; NO_SVE-NEXT: ld1 { v19.d }[0], [x9] +; NO_SVE-NEXT: b .LBB39_48 +; NO_SVE-NEXT: .LBB39_47: +; NO_SVE-NEXT: // implicit-def: $q19 +; NO_SVE-NEXT: .LBB39_48: // %else59 +; NO_SVE-NEXT: ldr q22, [x1, #96] +; NO_SVE-NEXT: sshll2 v20.2d, v20.4s, #3 +; NO_SVE-NEXT: tbz w8, #21, .LBB39_50 +; NO_SVE-NEXT: // %bb.49: // %cond.load61 +; NO_SVE-NEXT: mov x9, v21.d[1] +; NO_SVE-NEXT: ld1 { v19.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_50: // %else62 +; NO_SVE-NEXT: add v21.2d, v5.2d, v20.2d +; NO_SVE-NEXT: tbz w8, #22, .LBB39_52 +; NO_SVE-NEXT: // %bb.51: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d21 +; NO_SVE-NEXT: ld1 { v20.d }[0], [x9] +; NO_SVE-NEXT: sshll v23.2d, v22.2s, #3 +; NO_SVE-NEXT: tbnz w8, #23, .LBB39_53 +; NO_SVE-NEXT: b .LBB39_54 +; NO_SVE-NEXT: .LBB39_52: +; NO_SVE-NEXT: // implicit-def: $q20 +; NO_SVE-NEXT: sshll v23.2d, v22.2s, #3 +; NO_SVE-NEXT: tbz w8, #23, .LBB39_54 +; NO_SVE-NEXT: .LBB39_53: // %cond.load67 +; NO_SVE-NEXT: mov x9, v21.d[1] +; NO_SVE-NEXT: ld1 { v20.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_54: // %else68 +; NO_SVE-NEXT: add v23.2d, v5.2d, v23.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB39_56 +; NO_SVE-NEXT: // %bb.55: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d23 +; NO_SVE-NEXT: ld1 { v21.d }[0], [x9] +; NO_SVE-NEXT: b .LBB39_57 +; NO_SVE-NEXT: .LBB39_56: +; NO_SVE-NEXT: // implicit-def: $q21 +; NO_SVE-NEXT: .LBB39_57: // %else71 +; NO_SVE-NEXT: ldr q24, [x1, #112] +; NO_SVE-NEXT: sshll2 v22.2d, v22.4s, #3 +; NO_SVE-NEXT: tbz w8, #25, .LBB39_59 +; NO_SVE-NEXT: // %bb.58: // %cond.load73 +; NO_SVE-NEXT: mov x9, v23.d[1] +; NO_SVE-NEXT: ld1 { v21.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_59: // %else74 +; NO_SVE-NEXT: add v23.2d, v5.2d, v22.2d +; NO_SVE-NEXT: tbz w8, #26, .LBB39_61 +; NO_SVE-NEXT: // %bb.60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d23 +; NO_SVE-NEXT: ld1 { v22.d }[0], [x9] +; NO_SVE-NEXT: sshll v25.2d, v24.2s, #3 +; NO_SVE-NEXT: tbnz w8, #27, .LBB39_62 +; NO_SVE-NEXT: b .LBB39_63 +; NO_SVE-NEXT: .LBB39_61: +; NO_SVE-NEXT: // implicit-def: $q22 +; NO_SVE-NEXT: sshll v25.2d, v24.2s, #3 +; NO_SVE-NEXT: tbz w8, #27, .LBB39_63 +; NO_SVE-NEXT: .LBB39_62: // %cond.load79 +; NO_SVE-NEXT: mov x9, v23.d[1] +; NO_SVE-NEXT: ld1 { v22.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_63: // %else80 +; NO_SVE-NEXT: add v25.2d, v5.2d, v25.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB39_65 +; NO_SVE-NEXT: // %bb.64: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d25 +; NO_SVE-NEXT: ld1 { v23.d }[0], [x9] +; NO_SVE-NEXT: sshll2 v24.2d, v24.4s, #3 +; NO_SVE-NEXT: tbnz w8, #29, .LBB39_66 +; NO_SVE-NEXT: b .LBB39_67 +; NO_SVE-NEXT: .LBB39_65: +; NO_SVE-NEXT: // implicit-def: $q23 +; NO_SVE-NEXT: sshll2 v24.2d, v24.4s, #3 +; NO_SVE-NEXT: tbz w8, #29, .LBB39_67 +; NO_SVE-NEXT: .LBB39_66: // %cond.load85 +; NO_SVE-NEXT: mov x9, v25.d[1] +; NO_SVE-NEXT: ld1 { v23.d }[1], [x9] +; NO_SVE-NEXT: .LBB39_67: // %else86 +; NO_SVE-NEXT: add v24.2d, v5.2d, v24.2d +; NO_SVE-NEXT: tbz w8, #30, .LBB39_69 +; NO_SVE-NEXT: // %bb.68: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d24 +; NO_SVE-NEXT: ld1 { v5.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB39_70 +; NO_SVE-NEXT: b .LBB39_71 +; NO_SVE-NEXT: .LBB39_69: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w8, #31, .LBB39_71 +; NO_SVE-NEXT: .LBB39_70: // %cond.load91 +; NO_SVE-NEXT: mov x8, v24.d[1] +; NO_SVE-NEXT: ld1 { v5.d }[1], [x8] +; NO_SVE-NEXT: .LBB39_71: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q6, [x0, #64] +; NO_SVE-NEXT: 
stp q7, q16, [x0, #96] +; NO_SVE-NEXT: stp q17, q18, [x0, #128] +; NO_SVE-NEXT: stp q19, q20, [x0, #160] +; NO_SVE-NEXT: stp q21, q22, [x0, #192] +; NO_SVE-NEXT: stp q23, q5, [x0, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: masked_gather_32b_scaled_sext_f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: mov x12, #24 +; VBITS_EQ_256-NEXT: mov x13, #28 +; VBITS_EQ_256-NEXT: mov x14, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ptrue p1.s, vl8 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z18.s }, p1/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z17.s }, p1/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z16.s }, p1/z, [x1, x12, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z19.s }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z0.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z5.d, #0.0 +; VBITS_EQ_256-NEXT: sunpklo z22.d, z18.s +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z18.d, z18.s +; VBITS_EQ_256-NEXT: sunpklo z21.d, z17.s +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p2/z, [x2, z18.d, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0 +; VBITS_EQ_256-NEXT: ext z17.b, z17.b, z17.b, #16 +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [x2, z22.d, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0 +; VBITS_EQ_256-NEXT: sunpklo z20.d, z16.s +; VBITS_EQ_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z17.d, z17.s +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p2/z, [x2, z21.d, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z3.d, #0.0 +; VBITS_EQ_256-NEXT: sunpklo z16.d, z16.s +; VBITS_EQ_256-NEXT: sunpklo z23.d, z19.s +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p2/z, [x2, z17.d, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z4.d, #0.0 +; VBITS_EQ_256-NEXT: sunpklo z19.d, z19.s +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p2/z, [x2, z20.d, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x2, z16.d, lsl #3] +; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z6.d, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z7.d, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p2/z, [x2, z23.d, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x2, z19.d, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -1023,18 +8201,436 @@ } define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 { -; 
VBITS_GE_2048-LABEL: masked_gather_32b_scaled_zext: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: masked_gather_32b_scaled_zext: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, #0.0 +; NO_SVE-NEXT: ldr q2, [x1] +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v1.b[0] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v3.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v3.b[3] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w12, v3.b[4] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v3.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: fcmeq v0.8h, v4.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w15, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v3.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; 
NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v1.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: ushll v0.2d, v2.2s, #1 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v4.2d, v1.2d, v0.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB40_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: b .LBB40_3 +; NO_SVE-NEXT: .LBB40_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: .LBB40_3: // %else +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: ushll2 v2.2d, v2.4s, #1 +; NO_SVE-NEXT: tbnz w8, #1, .LBB40_13 +; NO_SVE-NEXT: // %bb.4: // %else2 +; NO_SVE-NEXT: add v2.2d, v1.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #2, .LBB40_14 +; NO_SVE-NEXT: .LBB40_5: // %else5 +; NO_SVE-NEXT: ushll v4.2d, v3.2s, #1 +; NO_SVE-NEXT: tbnz w8, #3, .LBB40_15 +; NO_SVE-NEXT: .LBB40_6: // %else8 +; NO_SVE-NEXT: add v2.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB40_8 +; NO_SVE-NEXT: .LBB40_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: .LBB40_8: // %else11 +; NO_SVE-NEXT: ldr q4, [x1, #32] +; NO_SVE-NEXT: ushll2 v3.2d, v3.4s, #1 +; NO_SVE-NEXT: tbnz w8, #5, .LBB40_16 +; NO_SVE-NEXT: // %bb.9: // %else14 +; NO_SVE-NEXT: add v2.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #6, .LBB40_17 +; NO_SVE-NEXT: .LBB40_10: // %else17 +; NO_SVE-NEXT: ushll v3.2d, v4.2s, #1 +; NO_SVE-NEXT: tbnz w8, #7, .LBB40_18 +; NO_SVE-NEXT: .LBB40_11: // %else20 +; NO_SVE-NEXT: add v5.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB40_19 +; NO_SVE-NEXT: .LBB40_12: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: b .LBB40_20 +; NO_SVE-NEXT: .LBB40_13: // %cond.load1 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: add v2.2d, v1.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #2, .LBB40_5 +; NO_SVE-NEXT: .LBB40_14: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: ushll v4.2d, v3.2s, #1 +; NO_SVE-NEXT: tbz w8, #3, .LBB40_6 +; NO_SVE-NEXT: .LBB40_15: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: add v2.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #4, .LBB40_7 +; NO_SVE-NEXT: b .LBB40_8 +; NO_SVE-NEXT: .LBB40_16: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: add v2.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #6, .LBB40_10 +; NO_SVE-NEXT: .LBB40_17: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: ushll v3.2d, v4.2s, #1 +; NO_SVE-NEXT: tbz w8, #7, .LBB40_11 +; NO_SVE-NEXT: .LBB40_18: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: add v5.2d, v1.2d, v3.2d +; 
NO_SVE-NEXT: tbnz w8, #8, .LBB40_12 +; NO_SVE-NEXT: .LBB40_19: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: .LBB40_20: // %else23 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: ushll2 v4.2d, v4.4s, #1 +; NO_SVE-NEXT: tbnz w8, #9, .LBB40_30 +; NO_SVE-NEXT: // %bb.21: // %else26 +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #10, .LBB40_31 +; NO_SVE-NEXT: .LBB40_22: // %else29 +; NO_SVE-NEXT: ushll v5.2d, v3.2s, #1 +; NO_SVE-NEXT: tbnz w8, #11, .LBB40_32 +; NO_SVE-NEXT: .LBB40_23: // %else32 +; NO_SVE-NEXT: add v4.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB40_25 +; NO_SVE-NEXT: .LBB40_24: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: .LBB40_25: // %else35 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: ushll2 v3.2d, v3.4s, #1 +; NO_SVE-NEXT: tbnz w8, #13, .LBB40_33 +; NO_SVE-NEXT: // %bb.26: // %else38 +; NO_SVE-NEXT: add v3.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #14, .LBB40_34 +; NO_SVE-NEXT: .LBB40_27: // %else41 +; NO_SVE-NEXT: ushll v4.2d, v5.2s, #1 +; NO_SVE-NEXT: tbnz w8, #15, .LBB40_35 +; NO_SVE-NEXT: .LBB40_28: // %else44 +; NO_SVE-NEXT: add v6.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB40_36 +; NO_SVE-NEXT: .LBB40_29: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: b .LBB40_37 +; NO_SVE-NEXT: .LBB40_30: // %cond.load25 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #10, .LBB40_22 +; NO_SVE-NEXT: .LBB40_31: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: ushll v5.2d, v3.2s, #1 +; NO_SVE-NEXT: tbz w8, #11, .LBB40_23 +; NO_SVE-NEXT: .LBB40_32: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #12, .LBB40_24 +; NO_SVE-NEXT: b .LBB40_25 +; NO_SVE-NEXT: .LBB40_33: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: add v3.2d, v1.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #14, .LBB40_27 +; NO_SVE-NEXT: .LBB40_34: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: ushll v4.2d, v5.2s, #1 +; NO_SVE-NEXT: tbz w8, #15, .LBB40_28 +; NO_SVE-NEXT: .LBB40_35: // %cond.load43 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: add v6.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #16, .LBB40_29 +; NO_SVE-NEXT: .LBB40_36: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: .LBB40_37: // %else47 +; NO_SVE-NEXT: ldr q4, [x1, #80] +; NO_SVE-NEXT: ushll2 v5.2d, v5.4s, #1 +; NO_SVE-NEXT: tbnz w8, #17, .LBB40_47 +; NO_SVE-NEXT: // %bb.38: // %else50 +; NO_SVE-NEXT: add v5.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #18, .LBB40_48 +; NO_SVE-NEXT: .LBB40_39: // %else53 +; NO_SVE-NEXT: ushll v6.2d, v4.2s, #1 +; NO_SVE-NEXT: tbnz w8, #19, .LBB40_49 +; NO_SVE-NEXT: .LBB40_40: // %else56 +; NO_SVE-NEXT: add v5.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB40_42 +; NO_SVE-NEXT: .LBB40_41: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: .LBB40_42: // %else59 +; NO_SVE-NEXT: ldr q6, [x1, #96] +; NO_SVE-NEXT: ushll2 v4.2d, v4.4s, #1 +; NO_SVE-NEXT: tbnz w8, #21, .LBB40_50 +; NO_SVE-NEXT: // %bb.43: // %else62 +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #22, .LBB40_51 +; NO_SVE-NEXT: .LBB40_44: // %else65 +; NO_SVE-NEXT: ushll v5.2d, v6.2s, #1 +; 
NO_SVE-NEXT: tbnz w8, #23, .LBB40_52 +; NO_SVE-NEXT: .LBB40_45: // %else68 +; NO_SVE-NEXT: add v7.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB40_53 +; NO_SVE-NEXT: .LBB40_46: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v4.h }[0], [x9] +; NO_SVE-NEXT: b .LBB40_54 +; NO_SVE-NEXT: .LBB40_47: // %cond.load49 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: add v5.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #18, .LBB40_39 +; NO_SVE-NEXT: .LBB40_48: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: ushll v6.2d, v4.2s, #1 +; NO_SVE-NEXT: tbz w8, #19, .LBB40_40 +; NO_SVE-NEXT: .LBB40_49: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: add v5.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #20, .LBB40_41 +; NO_SVE-NEXT: b .LBB40_42 +; NO_SVE-NEXT: .LBB40_50: // %cond.load61 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: add v4.2d, v1.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #22, .LBB40_44 +; NO_SVE-NEXT: .LBB40_51: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: ushll v5.2d, v6.2s, #1 +; NO_SVE-NEXT: tbz w8, #23, .LBB40_45 +; NO_SVE-NEXT: .LBB40_52: // %cond.load67 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[7], [x9] +; NO_SVE-NEXT: add v7.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #24, .LBB40_46 +; NO_SVE-NEXT: .LBB40_53: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: .LBB40_54: // %else71 +; NO_SVE-NEXT: ldr q5, [x1, #112] +; NO_SVE-NEXT: ushll2 v6.2d, v6.4s, #1 +; NO_SVE-NEXT: tbnz w8, #25, .LBB40_63 +; NO_SVE-NEXT: // %bb.55: // %else74 +; NO_SVE-NEXT: add v6.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #26, .LBB40_64 +; NO_SVE-NEXT: .LBB40_56: // %else77 +; NO_SVE-NEXT: ushll v7.2d, v5.2s, #1 +; NO_SVE-NEXT: tbnz w8, #27, .LBB40_65 +; NO_SVE-NEXT: .LBB40_57: // %else80 +; NO_SVE-NEXT: add v6.2d, v1.2d, v7.2d +; NO_SVE-NEXT: tbnz w8, #28, .LBB40_66 +; NO_SVE-NEXT: .LBB40_58: // %else83 +; NO_SVE-NEXT: ushll2 v5.2d, v5.4s, #1 +; NO_SVE-NEXT: tbnz w8, #29, .LBB40_67 +; NO_SVE-NEXT: .LBB40_59: // %else86 +; NO_SVE-NEXT: add v1.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB40_68 +; NO_SVE-NEXT: .LBB40_60: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB40_62 +; NO_SVE-NEXT: .LBB40_61: // %cond.load91 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[7], [x8] +; NO_SVE-NEXT: .LBB40_62: // %else92 +; NO_SVE-NEXT: stp q0, q2, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB40_63: // %cond.load73 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[1], [x9] +; NO_SVE-NEXT: add v6.2d, v1.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #26, .LBB40_56 +; NO_SVE-NEXT: .LBB40_64: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[2], [x9] +; NO_SVE-NEXT: ushll v7.2d, v5.2s, #1 +; NO_SVE-NEXT: tbz w8, #27, .LBB40_57 +; NO_SVE-NEXT: .LBB40_65: // %cond.load79 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[3], [x9] +; NO_SVE-NEXT: add v6.2d, v1.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB40_58 +; NO_SVE-NEXT: .LBB40_66: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[4], [x9] +; NO_SVE-NEXT: ushll2 v5.2d, v5.4s, #1 +; NO_SVE-NEXT: tbz w8, #29, .LBB40_59 +; NO_SVE-NEXT: .LBB40_67: // %cond.load85 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[5], [x9] +; NO_SVE-NEXT: add v1.2d, v1.2d, v5.2d +; NO_SVE-NEXT: tbz 
w8, #30, .LBB40_60 +; NO_SVE-NEXT: .LBB40_68: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v4.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB40_61 +; NO_SVE-NEXT: b .LBB40_62 +; +; VBITS_EQ_256-LABEL: masked_gather_32b_scaled_zext: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p1.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_EQ_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, uxtw #1] +; VBITS_EQ_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, uxtw #1] +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_EQ_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, uxtw #1] +; VBITS_EQ_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, uxtw #1] +; VBITS_EQ_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_zext: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = zext <32 x i32> %idxs to <32 x i64> @@ -1046,18 +8642,401 @@ } define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_unscaled_sext: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: masked_gather_32b_unscaled_sext: 
+; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, #0.0 +; NO_SVE-NEXT: ldr q2, [x1] +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v1.b[0] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v3.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v3.b[3] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w12, v3.b[4] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v3.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: fcmeq v0.8h, v4.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w15, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v3.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; 
NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v1.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: saddw v3.2d, v1.2d, v2.2s +; NO_SVE-NEXT: tbz w8, #0, .LBB41_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB41_3 +; NO_SVE-NEXT: b .LBB41_4 +; NO_SVE-NEXT: .LBB41_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB41_4 +; NO_SVE-NEXT: .LBB41_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB41_4: // %else2 +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: saddw2 v2.2d, v1.2d, v2.4s +; NO_SVE-NEXT: tbnz w8, #2, .LBB41_13 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB41_14 +; NO_SVE-NEXT: .LBB41_6: // %else8 +; NO_SVE-NEXT: saddw v2.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbnz w8, #4, .LBB41_15 +; NO_SVE-NEXT: .LBB41_7: // %else11 +; NO_SVE-NEXT: tbz w8, #5, .LBB41_9 +; NO_SVE-NEXT: .LBB41_8: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: .LBB41_9: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #32] +; NO_SVE-NEXT: saddw2 v2.2d, v1.2d, v3.4s +; NO_SVE-NEXT: tbnz w8, #6, .LBB41_16 +; NO_SVE-NEXT: // %bb.10: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB41_17 +; NO_SVE-NEXT: .LBB41_11: // %else20 +; NO_SVE-NEXT: saddw v3.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbz w8, #8, .LBB41_18 +; NO_SVE-NEXT: .LBB41_12: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB41_19 +; NO_SVE-NEXT: b .LBB41_20 +; NO_SVE-NEXT: .LBB41_13: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB41_6 +; NO_SVE-NEXT: .LBB41_14: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: saddw v2.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbz w8, #4, .LBB41_7 +; NO_SVE-NEXT: .LBB41_15: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB41_8 +; NO_SVE-NEXT: b .LBB41_9 +; NO_SVE-NEXT: .LBB41_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB41_11 +; NO_SVE-NEXT: .LBB41_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: saddw v3.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbnz w8, #8, .LBB41_12 +; NO_SVE-NEXT: .LBB41_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB41_20 +; NO_SVE-NEXT: .LBB41_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB41_20: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: saddw2 v4.2d, v1.2d, v4.4s +; NO_SVE-NEXT: tbnz w8, #10, .LBB41_29 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB41_30 +; NO_SVE-NEXT: .LBB41_22: // %else32 +; NO_SVE-NEXT: saddw v4.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbnz w8, #12, .LBB41_31 +; NO_SVE-NEXT: .LBB41_23: // %else35 +; NO_SVE-NEXT: tbz w8, #13, .LBB41_25 +; NO_SVE-NEXT: .LBB41_24: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: .LBB41_25: // %else38 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: saddw2 v3.2d, v1.2d, v3.4s +; NO_SVE-NEXT: 
tbnz w8, #14, .LBB41_32 +; NO_SVE-NEXT: // %bb.26: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB41_33 +; NO_SVE-NEXT: .LBB41_27: // %else44 +; NO_SVE-NEXT: saddw v4.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbz w8, #16, .LBB41_34 +; NO_SVE-NEXT: .LBB41_28: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB41_35 +; NO_SVE-NEXT: b .LBB41_36 +; NO_SVE-NEXT: .LBB41_29: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB41_22 +; NO_SVE-NEXT: .LBB41_30: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: saddw v4.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbz w8, #12, .LBB41_23 +; NO_SVE-NEXT: .LBB41_31: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB41_24 +; NO_SVE-NEXT: b .LBB41_25 +; NO_SVE-NEXT: .LBB41_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB41_27 +; NO_SVE-NEXT: .LBB41_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: saddw v4.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbnz w8, #16, .LBB41_28 +; NO_SVE-NEXT: .LBB41_34: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #17, .LBB41_36 +; NO_SVE-NEXT: .LBB41_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: .LBB41_36: // %else50 +; NO_SVE-NEXT: ldr q4, [x1, #80] +; NO_SVE-NEXT: saddw2 v5.2d, v1.2d, v5.4s +; NO_SVE-NEXT: tbnz w8, #18, .LBB41_45 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB41_46 +; NO_SVE-NEXT: .LBB41_38: // %else56 +; NO_SVE-NEXT: saddw v5.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbnz w8, #20, .LBB41_47 +; NO_SVE-NEXT: .LBB41_39: // %else59 +; NO_SVE-NEXT: tbz w8, #21, .LBB41_41 +; NO_SVE-NEXT: .LBB41_40: // %cond.load61 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: .LBB41_41: // %else62 +; NO_SVE-NEXT: ldr q6, [x1, #96] +; NO_SVE-NEXT: saddw2 v4.2d, v1.2d, v4.4s +; NO_SVE-NEXT: tbnz w8, #22, .LBB41_48 +; NO_SVE-NEXT: // %bb.42: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB41_49 +; NO_SVE-NEXT: .LBB41_43: // %else68 +; NO_SVE-NEXT: saddw v5.2d, v1.2d, v6.2s +; NO_SVE-NEXT: tbz w8, #24, .LBB41_50 +; NO_SVE-NEXT: .LBB41_44: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB41_51 +; NO_SVE-NEXT: b .LBB41_52 +; NO_SVE-NEXT: .LBB41_45: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB41_38 +; NO_SVE-NEXT: .LBB41_46: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: saddw v5.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbz w8, #20, .LBB41_39 +; NO_SVE-NEXT: .LBB41_47: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #21, .LBB41_40 +; NO_SVE-NEXT: b .LBB41_41 +; NO_SVE-NEXT: .LBB41_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB41_43 +; NO_SVE-NEXT: .LBB41_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[7], [x9] +; NO_SVE-NEXT: saddw v5.2d, v1.2d, v6.2s +; NO_SVE-NEXT: tbnz w8, #24, .LBB41_44 +; NO_SVE-NEXT: .LBB41_50: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #25, .LBB41_52 +; NO_SVE-NEXT: .LBB41_51: // 
%cond.load73 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[1], [x9] +; NO_SVE-NEXT: .LBB41_52: // %else74 +; NO_SVE-NEXT: ldr q5, [x1, #112] +; NO_SVE-NEXT: saddw2 v6.2d, v1.2d, v6.4s +; NO_SVE-NEXT: tbnz w8, #26, .LBB41_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB41_61 +; NO_SVE-NEXT: .LBB41_54: // %else80 +; NO_SVE-NEXT: saddw v6.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbnz w8, #28, .LBB41_62 +; NO_SVE-NEXT: .LBB41_55: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB41_63 +; NO_SVE-NEXT: .LBB41_56: // %else86 +; NO_SVE-NEXT: saddw2 v1.2d, v1.2d, v5.4s +; NO_SVE-NEXT: tbnz w8, #30, .LBB41_64 +; NO_SVE-NEXT: .LBB41_57: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB41_59 +; NO_SVE-NEXT: .LBB41_58: // %cond.load91 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[7], [x8] +; NO_SVE-NEXT: .LBB41_59: // %else92 +; NO_SVE-NEXT: stp q0, q2, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB41_60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB41_54 +; NO_SVE-NEXT: .LBB41_61: // %cond.load79 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[3], [x9] +; NO_SVE-NEXT: saddw v6.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbz w8, #28, .LBB41_55 +; NO_SVE-NEXT: .LBB41_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB41_56 +; NO_SVE-NEXT: .LBB41_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[5], [x9] +; NO_SVE-NEXT: saddw2 v1.2d, v1.2d, v5.4s +; NO_SVE-NEXT: tbz w8, #30, .LBB41_57 +; NO_SVE-NEXT: .LBB41_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v4.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB41_58 +; NO_SVE-NEXT: b .LBB41_59 +; +; VBITS_EQ_256-LABEL: masked_gather_32b_unscaled_sext: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p1.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_EQ_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, sxtw] +; VBITS_EQ_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, sxtw] +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_EQ_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, sxtw] +; VBITS_EQ_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, sxtw] +; VBITS_EQ_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: splice z3.h, p1, z3.h, z0.h 
+; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_unscaled_sext: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -1070,18 +9049,401 @@ } define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_unscaled_zext: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: masked_gather_32b_unscaled_zext: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; NO_SVE-NEXT: fcmeq v1.8h, v1.8h, #0.0 +; NO_SVE-NEXT: ldr q2, [x1] +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v1.b[0] +; NO_SVE-NEXT: fcmeq v3.8h, v3.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v3.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v3.b[3] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov 
w12, v3.b[4] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v3.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: fcmeq v0.8h, v4.8h, #0.0 +; NO_SVE-NEXT: bfi w9, w15, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v3.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v1.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: uaddw v3.2d, v1.2d, v2.2s +; NO_SVE-NEXT: tbz w8, #0, .LBB42_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ldr h0, [x9] +; NO_SVE-NEXT: tbnz w8, #1, .LBB42_3 +; NO_SVE-NEXT: b .LBB42_4 +; NO_SVE-NEXT: .LBB42_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB42_4 +; NO_SVE-NEXT: .LBB42_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB42_4: // %else2 +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: uaddw2 v2.2d, v1.2d, v2.4s +; NO_SVE-NEXT: tbnz w8, #2, .LBB42_13 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB42_14 +; NO_SVE-NEXT: .LBB42_6: // %else8 +; NO_SVE-NEXT: uaddw v2.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbnz w8, #4, .LBB42_15 +; NO_SVE-NEXT: .LBB42_7: // %else11 +; NO_SVE-NEXT: tbz w8, #5, .LBB42_9 +; NO_SVE-NEXT: .LBB42_8: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: .LBB42_9: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #32] +; NO_SVE-NEXT: uaddw2 v2.2d, v1.2d, v3.4s +; NO_SVE-NEXT: tbnz w8, #6, .LBB42_16 +; NO_SVE-NEXT: // %bb.10: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB42_17 +; NO_SVE-NEXT: .LBB42_11: // %else20 +; NO_SVE-NEXT: uaddw v3.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbz w8, #8, .LBB42_18 +; NO_SVE-NEXT: .LBB42_12: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB42_19 +; NO_SVE-NEXT: b .LBB42_20 +; NO_SVE-NEXT: .LBB42_13: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] 
+; NO_SVE-NEXT: tbz w8, #3, .LBB42_6 +; NO_SVE-NEXT: .LBB42_14: // %cond.load7 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: uaddw v2.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbz w8, #4, .LBB42_7 +; NO_SVE-NEXT: .LBB42_15: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB42_8 +; NO_SVE-NEXT: b .LBB42_9 +; NO_SVE-NEXT: .LBB42_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB42_11 +; NO_SVE-NEXT: .LBB42_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: uaddw v3.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbnz w8, #8, .LBB42_12 +; NO_SVE-NEXT: .LBB42_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB42_20 +; NO_SVE-NEXT: .LBB42_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB42_20: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: uaddw2 v4.2d, v1.2d, v4.4s +; NO_SVE-NEXT: tbnz w8, #10, .LBB42_29 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB42_30 +; NO_SVE-NEXT: .LBB42_22: // %else32 +; NO_SVE-NEXT: uaddw v4.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbnz w8, #12, .LBB42_31 +; NO_SVE-NEXT: .LBB42_23: // %else35 +; NO_SVE-NEXT: tbz w8, #13, .LBB42_25 +; NO_SVE-NEXT: .LBB42_24: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: .LBB42_25: // %else38 +; NO_SVE-NEXT: ldr q5, [x1, #64] +; NO_SVE-NEXT: uaddw2 v3.2d, v1.2d, v3.4s +; NO_SVE-NEXT: tbnz w8, #14, .LBB42_32 +; NO_SVE-NEXT: // %bb.26: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB42_33 +; NO_SVE-NEXT: .LBB42_27: // %else44 +; NO_SVE-NEXT: uaddw v4.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbz w8, #16, .LBB42_34 +; NO_SVE-NEXT: .LBB42_28: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB42_35 +; NO_SVE-NEXT: b .LBB42_36 +; NO_SVE-NEXT: .LBB42_29: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB42_22 +; NO_SVE-NEXT: .LBB42_30: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: uaddw v4.2d, v1.2d, v3.2s +; NO_SVE-NEXT: tbz w8, #12, .LBB42_23 +; NO_SVE-NEXT: .LBB42_31: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB42_24 +; NO_SVE-NEXT: b .LBB42_25 +; NO_SVE-NEXT: .LBB42_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB42_27 +; NO_SVE-NEXT: .LBB42_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: uaddw v4.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbnz w8, #16, .LBB42_28 +; NO_SVE-NEXT: .LBB42_34: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #17, .LBB42_36 +; NO_SVE-NEXT: .LBB42_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: .LBB42_36: // %else50 +; NO_SVE-NEXT: ldr q4, [x1, #80] +; NO_SVE-NEXT: uaddw2 v5.2d, v1.2d, v5.4s +; NO_SVE-NEXT: tbnz w8, #18, .LBB42_45 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB42_46 +; NO_SVE-NEXT: .LBB42_38: // %else56 +; NO_SVE-NEXT: uaddw v5.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbnz w8, #20, .LBB42_47 +; NO_SVE-NEXT: .LBB42_39: // %else59 +; NO_SVE-NEXT: tbz w8, #21, .LBB42_41 +; NO_SVE-NEXT: 
.LBB42_40: // %cond.load61 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: .LBB42_41: // %else62 +; NO_SVE-NEXT: ldr q6, [x1, #96] +; NO_SVE-NEXT: uaddw2 v4.2d, v1.2d, v4.4s +; NO_SVE-NEXT: tbnz w8, #22, .LBB42_48 +; NO_SVE-NEXT: // %bb.42: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB42_49 +; NO_SVE-NEXT: .LBB42_43: // %else68 +; NO_SVE-NEXT: uaddw v5.2d, v1.2d, v6.2s +; NO_SVE-NEXT: tbz w8, #24, .LBB42_50 +; NO_SVE-NEXT: .LBB42_44: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB42_51 +; NO_SVE-NEXT: b .LBB42_52 +; NO_SVE-NEXT: .LBB42_45: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB42_38 +; NO_SVE-NEXT: .LBB42_46: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: uaddw v5.2d, v1.2d, v4.2s +; NO_SVE-NEXT: tbz w8, #20, .LBB42_39 +; NO_SVE-NEXT: .LBB42_47: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: tbnz w8, #21, .LBB42_40 +; NO_SVE-NEXT: b .LBB42_41 +; NO_SVE-NEXT: .LBB42_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB42_43 +; NO_SVE-NEXT: .LBB42_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.h }[7], [x9] +; NO_SVE-NEXT: uaddw v5.2d, v1.2d, v6.2s +; NO_SVE-NEXT: tbnz w8, #24, .LBB42_44 +; NO_SVE-NEXT: .LBB42_50: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w8, #25, .LBB42_52 +; NO_SVE-NEXT: .LBB42_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[1], [x9] +; NO_SVE-NEXT: .LBB42_52: // %else74 +; NO_SVE-NEXT: ldr q5, [x1, #112] +; NO_SVE-NEXT: uaddw2 v6.2d, v1.2d, v6.4s +; NO_SVE-NEXT: tbnz w8, #26, .LBB42_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB42_61 +; NO_SVE-NEXT: .LBB42_54: // %else80 +; NO_SVE-NEXT: uaddw v6.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbnz w8, #28, .LBB42_62 +; NO_SVE-NEXT: .LBB42_55: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB42_63 +; NO_SVE-NEXT: .LBB42_56: // %else86 +; NO_SVE-NEXT: uaddw2 v1.2d, v1.2d, v5.4s +; NO_SVE-NEXT: tbnz w8, #30, .LBB42_64 +; NO_SVE-NEXT: .LBB42_57: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB42_59 +; NO_SVE-NEXT: .LBB42_58: // %cond.load91 +; NO_SVE-NEXT: mov x8, v1.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[7], [x8] +; NO_SVE-NEXT: .LBB42_59: // %else92 +; NO_SVE-NEXT: stp q0, q2, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB42_60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB42_54 +; NO_SVE-NEXT: .LBB42_61: // %cond.load79 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[3], [x9] +; NO_SVE-NEXT: uaddw v6.2d, v1.2d, v5.2s +; NO_SVE-NEXT: tbz w8, #28, .LBB42_55 +; NO_SVE-NEXT: .LBB42_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v4.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB42_56 +; NO_SVE-NEXT: .LBB42_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v4.h }[5], [x9] +; NO_SVE-NEXT: uaddw2 v1.2d, v1.2d, v5.4s +; NO_SVE-NEXT: tbz w8, #30, .LBB42_57 +; NO_SVE-NEXT: .LBB42_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v4.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB42_58 +; NO_SVE-NEXT: b .LBB42_59 +; +; VBITS_EQ_256-LABEL: masked_gather_32b_unscaled_zext: +; VBITS_EQ_256: // 
%bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #24 +; VBITS_EQ_256-NEXT: ptrue p1.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_EQ_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_EQ_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_EQ_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, uxtw] +; VBITS_EQ_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, uxtw] +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_EQ_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, uxtw] +; VBITS_EQ_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, uxtw] +; VBITS_EQ_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: ptrue p1.h, vl8 +; VBITS_EQ_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_EQ_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_unscaled_zext: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = zext <32 x i32> %idxs to <32 x i64> @@ -1094,6 +9456,452 @@ } define void @masked_gather_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %base) #0 { +; NO_SVE-LABEL: masked_gather_64b_scaled: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v3.8h, v4.8h +; NO_SVE-NEXT: ldp q5, q6, [x0, #96] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #48] +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: uzp1 v5.8h, v5.8h, v6.8h +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] 
+; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v5.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v5.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v5.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w13, v5.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v3.4s, v4.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v4.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[6] +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v5.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v5.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v2.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: shl v0.2d, v0.2d, #2 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v3.2d, 
v2.2d, v0.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB43_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: shl v1.2d, v1.2d, #2 +; NO_SVE-NEXT: tbnz w8, #1, .LBB43_3 +; NO_SVE-NEXT: b .LBB43_4 +; NO_SVE-NEXT: .LBB43_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: shl v1.2d, v1.2d, #2 +; NO_SVE-NEXT: tbz w8, #1, .LBB43_4 +; NO_SVE-NEXT: .LBB43_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_4: // %else2 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: add v1.2d, v2.2d, v1.2d +; NO_SVE-NEXT: tbz w8, #2, .LBB43_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_6: // %else5 +; NO_SVE-NEXT: shl v4.2d, v3.2d, #2 +; NO_SVE-NEXT: tbz w8, #3, .LBB43_8 +; NO_SVE-NEXT: // %bb.7: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_8: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: add v4.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB43_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: shl v3.2d, v3.2d, #2 +; NO_SVE-NEXT: tbnz w8, #5, .LBB43_11 +; NO_SVE-NEXT: b .LBB43_12 +; NO_SVE-NEXT: .LBB43_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: shl v3.2d, v3.2d, #2 +; NO_SVE-NEXT: tbz w8, #5, .LBB43_12 +; NO_SVE-NEXT: .LBB43_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_12: // %else14 +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: add v3.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #6, .LBB43_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_14: // %else17 +; NO_SVE-NEXT: shl v4.2d, v4.2d, #2 +; NO_SVE-NEXT: tbz w8, #7, .LBB43_16 +; NO_SVE-NEXT: // %bb.15: // %cond.load19 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_16: // %else20 +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: add v4.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB43_18 +; NO_SVE-NEXT: // %bb.17: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: shl v6.2d, v5.2d, #2 +; NO_SVE-NEXT: tbnz w8, #9, .LBB43_19 +; NO_SVE-NEXT: b .LBB43_20 +; NO_SVE-NEXT: .LBB43_18: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: shl v6.2d, v5.2d, #2 +; NO_SVE-NEXT: tbz w8, #9, .LBB43_20 +; NO_SVE-NEXT: .LBB43_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_20: // %else26 +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: add v4.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #10, .LBB43_22 +; NO_SVE-NEXT: // %bb.21: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_22: // %else29 +; NO_SVE-NEXT: shl v5.2d, v5.2d, #2 +; NO_SVE-NEXT: tbz w8, #11, .LBB43_24 +; NO_SVE-NEXT: // %bb.23: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_24: // %else32 +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: add v5.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB43_26 +; NO_SVE-NEXT: // %bb.25: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: shl v7.2d, v6.2d, #2 +; NO_SVE-NEXT: tbnz w8, #13, .LBB43_27 +; NO_SVE-NEXT: b .LBB43_28 +; NO_SVE-NEXT: 
.LBB43_26: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: shl v7.2d, v6.2d, #2 +; NO_SVE-NEXT: tbz w8, #13, .LBB43_28 +; NO_SVE-NEXT: .LBB43_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_28: // %else38 +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: add v5.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #14, .LBB43_30 +; NO_SVE-NEXT: // %bb.29: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_30: // %else41 +; NO_SVE-NEXT: shl v6.2d, v6.2d, #2 +; NO_SVE-NEXT: tbz w8, #15, .LBB43_32 +; NO_SVE-NEXT: // %bb.31: // %cond.load43 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_32: // %else44 +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: add v6.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB43_34 +; NO_SVE-NEXT: // %bb.33: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: shl v16.2d, v7.2d, #2 +; NO_SVE-NEXT: tbnz w8, #17, .LBB43_35 +; NO_SVE-NEXT: b .LBB43_36 +; NO_SVE-NEXT: .LBB43_34: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: shl v16.2d, v7.2d, #2 +; NO_SVE-NEXT: tbz w8, #17, .LBB43_36 +; NO_SVE-NEXT: .LBB43_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_36: // %else50 +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: add v6.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbz w8, #18, .LBB43_38 +; NO_SVE-NEXT: // %bb.37: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_38: // %else53 +; NO_SVE-NEXT: shl v7.2d, v7.2d, #2 +; NO_SVE-NEXT: tbz w8, #19, .LBB43_40 +; NO_SVE-NEXT: // %bb.39: // %cond.load55 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_40: // %else56 +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: add v7.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB43_42 +; NO_SVE-NEXT: // %bb.41: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: shl v17.2d, v16.2d, #2 +; NO_SVE-NEXT: tbnz w8, #21, .LBB43_43 +; NO_SVE-NEXT: b .LBB43_44 +; NO_SVE-NEXT: .LBB43_42: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: shl v17.2d, v16.2d, #2 +; NO_SVE-NEXT: tbz w8, #21, .LBB43_44 +; NO_SVE-NEXT: .LBB43_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_44: // %else62 +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: add v7.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbz w8, #22, .LBB43_46 +; NO_SVE-NEXT: // %bb.45: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_46: // %else65 +; NO_SVE-NEXT: shl v16.2d, v16.2d, #2 +; NO_SVE-NEXT: tbz w8, #23, .LBB43_48 +; NO_SVE-NEXT: // %bb.47: // %cond.load67 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_48: // %else68 +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: add v16.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB43_50 +; NO_SVE-NEXT: // %bb.49: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: shl v18.2d, v17.2d, #2 +; NO_SVE-NEXT: tbnz w8, #25, .LBB43_51 +; NO_SVE-NEXT: b .LBB43_52 +; NO_SVE-NEXT: .LBB43_50: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: shl v18.2d, v17.2d, #2 +; NO_SVE-NEXT: tbz w8, #25, .LBB43_52 +; NO_SVE-NEXT: .LBB43_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v16.d[1] +; 
NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_52: // %else74 +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: add v16.2d, v2.2d, v18.2d +; NO_SVE-NEXT: tbz w8, #26, .LBB43_54 +; NO_SVE-NEXT: // %bb.53: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: .LBB43_54: // %else77 +; NO_SVE-NEXT: shl v17.2d, v17.2d, #2 +; NO_SVE-NEXT: tbz w8, #27, .LBB43_56 +; NO_SVE-NEXT: // %bb.55: // %cond.load79 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: .LBB43_56: // %else80 +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: add v17.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB43_58 +; NO_SVE-NEXT: // %bb.57: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.s }[0], [x9] +; NO_SVE-NEXT: shl v18.2d, v18.2d, #2 +; NO_SVE-NEXT: tbnz w8, #29, .LBB43_59 +; NO_SVE-NEXT: b .LBB43_60 +; NO_SVE-NEXT: .LBB43_58: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: shl v18.2d, v18.2d, #2 +; NO_SVE-NEXT: tbz w8, #29, .LBB43_60 +; NO_SVE-NEXT: .LBB43_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[1], [x9] +; NO_SVE-NEXT: .LBB43_60: // %else86 +; NO_SVE-NEXT: add v2.2d, v2.2d, v18.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB43_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB43_63 +; NO_SVE-NEXT: .LBB43_62: // %cond.load91 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[3], [x8] +; NO_SVE-NEXT: .LBB43_63: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: stp q5, q6, [x0, #64] +; NO_SVE-NEXT: stp q7, q16, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB43_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v16.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB43_62 +; NO_SVE-NEXT: b .LBB43_63 +; +; VBITS_EQ_256-LABEL: masked_gather_64b_scaled: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d, lsl #2] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, 
z1.s, #0.0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d, lsl #2] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d, lsl #2] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d, lsl #2] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_64b_scaled: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1116,6 +9924,464 @@ } define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %base) #0 { +; NO_SVE-LABEL: masked_gather_64b_unscaled: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v3.8h, v4.8h +; NO_SVE-NEXT: ldp q5, q6, [x0, #96] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #48] +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: uzp1 v5.8h, v5.8h, v6.8h +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; 
NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v5.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v5.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v5.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w13, v5.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v3.4s, v4.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v4.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[6] +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v5.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v5.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v2.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v1.2d, v2.2d, v0.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB44_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #1, .LBB44_3 +; NO_SVE-NEXT: b .LBB44_4 +; NO_SVE-NEXT: .LBB44_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: ldr q3, [x1, #16] +; 
NO_SVE-NEXT: tbz w8, #1, .LBB44_4 +; NO_SVE-NEXT: .LBB44_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_4: // %else2 +; NO_SVE-NEXT: add v1.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #2, .LBB44_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #3, .LBB44_9 +; NO_SVE-NEXT: .LBB44_6: // %else8 +; NO_SVE-NEXT: add v4.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB44_10 +; NO_SVE-NEXT: .LBB44_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #5, .LBB44_11 +; NO_SVE-NEXT: b .LBB44_12 +; NO_SVE-NEXT: .LBB44_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #3, .LBB44_6 +; NO_SVE-NEXT: .LBB44_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #4, .LBB44_7 +; NO_SVE-NEXT: .LBB44_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbz w8, #5, .LBB44_12 +; NO_SVE-NEXT: .LBB44_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_12: // %else14 +; NO_SVE-NEXT: add v3.2d, v2.2d, v3.2d +; NO_SVE-NEXT: tbnz w8, #6, .LBB44_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #7, .LBB44_17 +; NO_SVE-NEXT: .LBB44_14: // %else20 +; NO_SVE-NEXT: add v4.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB44_18 +; NO_SVE-NEXT: .LBB44_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #9, .LBB44_19 +; NO_SVE-NEXT: b .LBB44_20 +; NO_SVE-NEXT: .LBB44_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: tbz w8, #7, .LBB44_14 +; NO_SVE-NEXT: .LBB44_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v2.2d, v4.2d +; NO_SVE-NEXT: tbnz w8, #8, .LBB44_15 +; NO_SVE-NEXT: .LBB44_18: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: tbz w8, #9, .LBB44_20 +; NO_SVE-NEXT: .LBB44_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_20: // %else26 +; NO_SVE-NEXT: add v4.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #10, .LBB44_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #11, .LBB44_25 +; NO_SVE-NEXT: .LBB44_22: // %else32 +; NO_SVE-NEXT: add v5.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB44_26 +; NO_SVE-NEXT: .LBB44_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #13, .LBB44_27 +; NO_SVE-NEXT: b .LBB44_28 +; NO_SVE-NEXT: .LBB44_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: tbz w8, #11, .LBB44_22 +; NO_SVE-NEXT: .LBB44_25: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: add v5.2d, v2.2d, v5.2d +; NO_SVE-NEXT: tbnz w8, #12, .LBB44_23 +; NO_SVE-NEXT: .LBB44_26: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: tbz w8, #13, .LBB44_28 
+; NO_SVE-NEXT: .LBB44_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_28: // %else38 +; NO_SVE-NEXT: add v5.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #14, .LBB44_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #15, .LBB44_33 +; NO_SVE-NEXT: .LBB44_30: // %else44 +; NO_SVE-NEXT: add v6.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB44_34 +; NO_SVE-NEXT: .LBB44_31: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #17, .LBB44_35 +; NO_SVE-NEXT: b .LBB44_36 +; NO_SVE-NEXT: .LBB44_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: tbz w8, #15, .LBB44_30 +; NO_SVE-NEXT: .LBB44_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: add v6.2d, v2.2d, v6.2d +; NO_SVE-NEXT: tbnz w8, #16, .LBB44_31 +; NO_SVE-NEXT: .LBB44_34: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: tbz w8, #17, .LBB44_36 +; NO_SVE-NEXT: .LBB44_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_36: // %else50 +; NO_SVE-NEXT: add v6.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbnz w8, #18, .LBB44_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #19, .LBB44_41 +; NO_SVE-NEXT: .LBB44_38: // %else56 +; NO_SVE-NEXT: add v7.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB44_42 +; NO_SVE-NEXT: .LBB44_39: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #21, .LBB44_43 +; NO_SVE-NEXT: b .LBB44_44 +; NO_SVE-NEXT: .LBB44_40: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: tbz w8, #19, .LBB44_38 +; NO_SVE-NEXT: .LBB44_41: // %cond.load55 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: add v7.2d, v2.2d, v7.2d +; NO_SVE-NEXT: tbnz w8, #20, .LBB44_39 +; NO_SVE-NEXT: .LBB44_42: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbz w8, #21, .LBB44_44 +; NO_SVE-NEXT: .LBB44_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_44: // %else62 +; NO_SVE-NEXT: add v7.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbnz w8, #22, .LBB44_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #23, .LBB44_49 +; NO_SVE-NEXT: .LBB44_46: // %else68 +; NO_SVE-NEXT: add v16.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB44_50 +; NO_SVE-NEXT: .LBB44_47: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #25, .LBB44_51 +; NO_SVE-NEXT: b .LBB44_52 +; NO_SVE-NEXT: .LBB44_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbz w8, #23, .LBB44_46 +; NO_SVE-NEXT: .LBB44_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: add v16.2d, v2.2d, v16.2d +; NO_SVE-NEXT: tbnz w8, #24, .LBB44_47 +; NO_SVE-NEXT: .LBB44_50: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: tbz 
w8, #25, .LBB44_52 +; NO_SVE-NEXT: .LBB44_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_52: // %else74 +; NO_SVE-NEXT: add v16.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbnz w8, #26, .LBB44_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #27, .LBB44_57 +; NO_SVE-NEXT: .LBB44_54: // %else80 +; NO_SVE-NEXT: add v17.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB44_58 +; NO_SVE-NEXT: .LBB44_55: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.s }[0], [x9] +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #29, .LBB44_59 +; NO_SVE-NEXT: b .LBB44_60 +; NO_SVE-NEXT: .LBB44_56: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: tbz w8, #27, .LBB44_54 +; NO_SVE-NEXT: .LBB44_57: // %cond.load79 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: add v17.2d, v2.2d, v17.2d +; NO_SVE-NEXT: tbnz w8, #28, .LBB44_55 +; NO_SVE-NEXT: .LBB44_58: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: tbz w8, #29, .LBB44_60 +; NO_SVE-NEXT: .LBB44_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[1], [x9] +; NO_SVE-NEXT: .LBB44_60: // %else86 +; NO_SVE-NEXT: add v2.2d, v2.2d, v18.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB44_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB44_63 +; NO_SVE-NEXT: .LBB44_62: // %cond.load91 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[3], [x8] +; NO_SVE-NEXT: .LBB44_63: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: stp q5, q6, [x0, #64] +; NO_SVE-NEXT: stp q7, q16, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB44_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v16.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB44_62 +; NO_SVE-NEXT: b .LBB44_63 +; +; VBITS_EQ_256-LABEL: masked_gather_64b_unscaled: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, 
p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_64b_unscaled: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1139,6 +10405,464 @@ } define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) #0 { +; NO_SVE-LABEL: masked_gather_vec_plus_reg: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v3.8h, v4.8h +; NO_SVE-NEXT: ldp q5, q6, [x0, #96] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #48] +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: uzp1 v5.8h, v5.8h, v6.8h +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: umov w14, 
v2.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v5.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v5.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v5.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w13, v5.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v3.4s, v4.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v4.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[6] +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v5.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v1.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v5.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: dup v2.2d, x2 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: add v1.2d, v0.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB45_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #1, .LBB45_3 +; NO_SVE-NEXT: 
b .LBB45_4 +; NO_SVE-NEXT: .LBB45_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: tbz w8, #1, .LBB45_4 +; NO_SVE-NEXT: .LBB45_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_4: // %else2 +; NO_SVE-NEXT: add v1.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #2, .LBB45_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #3, .LBB45_9 +; NO_SVE-NEXT: .LBB45_6: // %else8 +; NO_SVE-NEXT: add v4.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB45_10 +; NO_SVE-NEXT: .LBB45_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #5, .LBB45_11 +; NO_SVE-NEXT: b .LBB45_12 +; NO_SVE-NEXT: .LBB45_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #3, .LBB45_6 +; NO_SVE-NEXT: .LBB45_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #4, .LBB45_7 +; NO_SVE-NEXT: .LBB45_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbz w8, #5, .LBB45_12 +; NO_SVE-NEXT: .LBB45_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_12: // %else14 +; NO_SVE-NEXT: add v3.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #6, .LBB45_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #7, .LBB45_17 +; NO_SVE-NEXT: .LBB45_14: // %else20 +; NO_SVE-NEXT: add v4.2d, v4.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB45_18 +; NO_SVE-NEXT: .LBB45_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #9, .LBB45_19 +; NO_SVE-NEXT: b .LBB45_20 +; NO_SVE-NEXT: .LBB45_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: tbz w8, #7, .LBB45_14 +; NO_SVE-NEXT: .LBB45_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v4.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #8, .LBB45_15 +; NO_SVE-NEXT: .LBB45_18: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: tbz w8, #9, .LBB45_20 +; NO_SVE-NEXT: .LBB45_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_20: // %else26 +; NO_SVE-NEXT: add v4.2d, v5.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #10, .LBB45_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #11, .LBB45_25 +; NO_SVE-NEXT: .LBB45_22: // %else32 +; NO_SVE-NEXT: add v5.2d, v5.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB45_26 +; NO_SVE-NEXT: .LBB45_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #13, .LBB45_27 +; NO_SVE-NEXT: b .LBB45_28 +; NO_SVE-NEXT: .LBB45_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: tbz w8, #11, .LBB45_22 +; NO_SVE-NEXT: .LBB45_25: // %cond.load31 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: add v5.2d, v5.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #12, .LBB45_23 +; NO_SVE-NEXT: .LBB45_26: 
+; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: tbz w8, #13, .LBB45_28 +; NO_SVE-NEXT: .LBB45_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_28: // %else38 +; NO_SVE-NEXT: add v5.2d, v6.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #14, .LBB45_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #15, .LBB45_33 +; NO_SVE-NEXT: .LBB45_30: // %else44 +; NO_SVE-NEXT: add v6.2d, v6.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB45_34 +; NO_SVE-NEXT: .LBB45_31: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #17, .LBB45_35 +; NO_SVE-NEXT: b .LBB45_36 +; NO_SVE-NEXT: .LBB45_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: tbz w8, #15, .LBB45_30 +; NO_SVE-NEXT: .LBB45_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: add v6.2d, v6.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #16, .LBB45_31 +; NO_SVE-NEXT: .LBB45_34: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: tbz w8, #17, .LBB45_36 +; NO_SVE-NEXT: .LBB45_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_36: // %else50 +; NO_SVE-NEXT: add v6.2d, v7.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #18, .LBB45_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #19, .LBB45_41 +; NO_SVE-NEXT: .LBB45_38: // %else56 +; NO_SVE-NEXT: add v7.2d, v7.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB45_42 +; NO_SVE-NEXT: .LBB45_39: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #21, .LBB45_43 +; NO_SVE-NEXT: b .LBB45_44 +; NO_SVE-NEXT: .LBB45_40: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: tbz w8, #19, .LBB45_38 +; NO_SVE-NEXT: .LBB45_41: // %cond.load55 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: add v7.2d, v7.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #20, .LBB45_39 +; NO_SVE-NEXT: .LBB45_42: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbz w8, #21, .LBB45_44 +; NO_SVE-NEXT: .LBB45_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_44: // %else62 +; NO_SVE-NEXT: add v7.2d, v16.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #22, .LBB45_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #23, .LBB45_49 +; NO_SVE-NEXT: .LBB45_46: // %else68 +; NO_SVE-NEXT: add v16.2d, v16.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB45_50 +; NO_SVE-NEXT: .LBB45_47: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #25, .LBB45_51 +; NO_SVE-NEXT: b .LBB45_52 +; NO_SVE-NEXT: .LBB45_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbz w8, #23, .LBB45_46 +; NO_SVE-NEXT: .LBB45_49: // %cond.load67 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: add v16.2d, v16.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #24, .LBB45_47 +; 
NO_SVE-NEXT: .LBB45_50: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: tbz w8, #25, .LBB45_52 +; NO_SVE-NEXT: .LBB45_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_52: // %else74 +; NO_SVE-NEXT: add v16.2d, v17.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #26, .LBB45_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #27, .LBB45_57 +; NO_SVE-NEXT: .LBB45_54: // %else80 +; NO_SVE-NEXT: add v17.2d, v17.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB45_58 +; NO_SVE-NEXT: .LBB45_55: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.s }[0], [x9] +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #29, .LBB45_59 +; NO_SVE-NEXT: b .LBB45_60 +; NO_SVE-NEXT: .LBB45_56: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: tbz w8, #27, .LBB45_54 +; NO_SVE-NEXT: .LBB45_57: // %cond.load79 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: add v17.2d, v17.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #28, .LBB45_55 +; NO_SVE-NEXT: .LBB45_58: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: tbz w8, #29, .LBB45_60 +; NO_SVE-NEXT: .LBB45_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[1], [x9] +; NO_SVE-NEXT: .LBB45_60: // %else86 +; NO_SVE-NEXT: add v2.2d, v18.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB45_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB45_63 +; NO_SVE-NEXT: .LBB45_62: // %cond.load91 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[3], [x8] +; NO_SVE-NEXT: .LBB45_63: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: stp q5, q6, [x0, #64] +; NO_SVE-NEXT: stp q7, q16, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB45_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v16.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB45_62 +; NO_SVE-NEXT: b .LBB45_63 +; +; VBITS_EQ_256-LABEL: masked_gather_vec_plus_reg: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, 
p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_vec_plus_reg: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1162,6 +10886,464 @@ } define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_vec_plus_imm: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q3, [x0, #64] +; NO_SVE-NEXT: mov w9, #4 +; NO_SVE-NEXT: fcmeq v6.4s, v0.4s, #0.0 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v6.8h, v3.8h +; NO_SVE-NEXT: ldp q4, q5, [x0, #96] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: ldp q0, q7, [x0, #32] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w11, v2.b[2] +; NO_SVE-NEXT: uzp1 v4.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w10, v2.b[0] +; NO_SVE-NEXT: umov w12, v2.b[3] +; NO_SVE-NEXT: umov w13, v2.b[4] +; NO_SVE-NEXT: umov w14, v2.b[5] +; 
NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v4.8h +; NO_SVE-NEXT: umov w15, v2.b[6] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[7] +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w8, v3.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w11, v3.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v3.b[2] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[3] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w14, v3.b[4] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[1] +; NO_SVE-NEXT: orr w8, w10, w8, lsl #8 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #9 +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: fcmeq v6.4s, v7.4s, #0.0 +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #10 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: orr w8, w8, w13, lsl #11 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: orr w8, w8, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v1.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v6.8h +; NO_SVE-NEXT: bfi w10, w13, #1, #1 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: umov w15, v3.b[5] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w10, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v3.b[6] +; NO_SVE-NEXT: bfi w10, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w10, w13, lsl #6 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w15, lsl #13 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[3] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #7 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v0.b[4] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v3.b[7] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[7] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w8, w12, lsl #15 +; NO_SVE-NEXT: orr w8, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #15 +; NO_SVE-NEXT: dup v2.2d, x9 +; NO_SVE-NEXT: bfi w8, w12, #16, #16 +; NO_SVE-NEXT: add v1.2d, v0.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #0, .LBB46_2 +; NO_SVE-NEXT: // %bb.1: // 
%cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ldr s0, [x9] +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #1, .LBB46_3 +; NO_SVE-NEXT: b .LBB46_4 +; NO_SVE-NEXT: .LBB46_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: tbz w8, #1, .LBB46_4 +; NO_SVE-NEXT: .LBB46_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_4: // %else2 +; NO_SVE-NEXT: add v1.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #2, .LBB46_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #3, .LBB46_9 +; NO_SVE-NEXT: .LBB46_6: // %else8 +; NO_SVE-NEXT: add v4.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #4, .LBB46_10 +; NO_SVE-NEXT: .LBB46_7: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #5, .LBB46_11 +; NO_SVE-NEXT: b .LBB46_12 +; NO_SVE-NEXT: .LBB46_8: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #3, .LBB46_6 +; NO_SVE-NEXT: .LBB46_9: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #4, .LBB46_7 +; NO_SVE-NEXT: .LBB46_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbz w8, #5, .LBB46_12 +; NO_SVE-NEXT: .LBB46_11: // %cond.load13 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_12: // %else14 +; NO_SVE-NEXT: add v3.2d, v3.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #6, .LBB46_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #7, .LBB46_17 +; NO_SVE-NEXT: .LBB46_14: // %else20 +; NO_SVE-NEXT: add v4.2d, v4.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #8, .LBB46_18 +; NO_SVE-NEXT: .LBB46_15: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #9, .LBB46_19 +; NO_SVE-NEXT: b .LBB46_20 +; NO_SVE-NEXT: .LBB46_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #64] +; NO_SVE-NEXT: tbz w8, #7, .LBB46_14 +; NO_SVE-NEXT: .LBB46_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: add v4.2d, v4.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #8, .LBB46_15 +; NO_SVE-NEXT: .LBB46_18: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: ldr q5, [x1, #80] +; NO_SVE-NEXT: tbz w8, #9, .LBB46_20 +; NO_SVE-NEXT: .LBB46_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_20: // %else26 +; NO_SVE-NEXT: add v4.2d, v5.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #10, .LBB46_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #11, .LBB46_25 +; NO_SVE-NEXT: .LBB46_22: // %else32 +; NO_SVE-NEXT: add v5.2d, v5.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #12, .LBB46_26 +; NO_SVE-NEXT: .LBB46_23: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #13, .LBB46_27 +; NO_SVE-NEXT: b .LBB46_28 +; NO_SVE-NEXT: .LBB46_24: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #96] +; NO_SVE-NEXT: tbz w8, #11, .LBB46_22 +; NO_SVE-NEXT: .LBB46_25: // %cond.load31 +; NO_SVE-NEXT: mov 
x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: add v5.2d, v5.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #12, .LBB46_23 +; NO_SVE-NEXT: .LBB46_26: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: ldr q6, [x1, #112] +; NO_SVE-NEXT: tbz w8, #13, .LBB46_28 +; NO_SVE-NEXT: .LBB46_27: // %cond.load37 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_28: // %else38 +; NO_SVE-NEXT: add v5.2d, v6.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #14, .LBB46_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #15, .LBB46_33 +; NO_SVE-NEXT: .LBB46_30: // %else44 +; NO_SVE-NEXT: add v6.2d, v6.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #16, .LBB46_34 +; NO_SVE-NEXT: .LBB46_31: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #17, .LBB46_35 +; NO_SVE-NEXT: b .LBB46_36 +; NO_SVE-NEXT: .LBB46_32: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #128] +; NO_SVE-NEXT: tbz w8, #15, .LBB46_30 +; NO_SVE-NEXT: .LBB46_33: // %cond.load43 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: add v6.2d, v6.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #16, .LBB46_31 +; NO_SVE-NEXT: .LBB46_34: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: ldr q7, [x1, #144] +; NO_SVE-NEXT: tbz w8, #17, .LBB46_36 +; NO_SVE-NEXT: .LBB46_35: // %cond.load49 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_36: // %else50 +; NO_SVE-NEXT: add v6.2d, v7.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #18, .LBB46_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #19, .LBB46_41 +; NO_SVE-NEXT: .LBB46_38: // %else56 +; NO_SVE-NEXT: add v7.2d, v7.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #20, .LBB46_42 +; NO_SVE-NEXT: .LBB46_39: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #21, .LBB46_43 +; NO_SVE-NEXT: b .LBB46_44 +; NO_SVE-NEXT: .LBB46_40: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #160] +; NO_SVE-NEXT: tbz w8, #19, .LBB46_38 +; NO_SVE-NEXT: .LBB46_41: // %cond.load55 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: add v7.2d, v7.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #20, .LBB46_39 +; NO_SVE-NEXT: .LBB46_42: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbz w8, #21, .LBB46_44 +; NO_SVE-NEXT: .LBB46_43: // %cond.load61 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_44: // %else62 +; NO_SVE-NEXT: add v7.2d, v16.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #22, .LBB46_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #23, .LBB46_49 +; NO_SVE-NEXT: .LBB46_46: // %else68 +; NO_SVE-NEXT: add v16.2d, v16.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #24, .LBB46_50 +; NO_SVE-NEXT: .LBB46_47: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #25, .LBB46_51 +; NO_SVE-NEXT: b .LBB46_52 +; NO_SVE-NEXT: .LBB46_48: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbz w8, #23, .LBB46_46 +; NO_SVE-NEXT: .LBB46_49: // %cond.load67 +; 
NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: add v16.2d, v16.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #24, .LBB46_47 +; NO_SVE-NEXT: .LBB46_50: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: ldr q17, [x1, #208] +; NO_SVE-NEXT: tbz w8, #25, .LBB46_52 +; NO_SVE-NEXT: .LBB46_51: // %cond.load73 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_52: // %else74 +; NO_SVE-NEXT: add v16.2d, v17.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #26, .LBB46_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #27, .LBB46_57 +; NO_SVE-NEXT: .LBB46_54: // %else80 +; NO_SVE-NEXT: add v17.2d, v17.2d, v2.2d +; NO_SVE-NEXT: tbz w8, #28, .LBB46_58 +; NO_SVE-NEXT: .LBB46_55: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d17 +; NO_SVE-NEXT: ld1 { v16.s }[0], [x9] +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #29, .LBB46_59 +; NO_SVE-NEXT: b .LBB46_60 +; NO_SVE-NEXT: .LBB46_56: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: ldr q17, [x1, #224] +; NO_SVE-NEXT: tbz w8, #27, .LBB46_54 +; NO_SVE-NEXT: .LBB46_57: // %cond.load79 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: add v17.2d, v17.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #28, .LBB46_55 +; NO_SVE-NEXT: .LBB46_58: +; NO_SVE-NEXT: // implicit-def: $q16 +; NO_SVE-NEXT: ldr q18, [x1, #240] +; NO_SVE-NEXT: tbz w8, #29, .LBB46_60 +; NO_SVE-NEXT: .LBB46_59: // %cond.load85 +; NO_SVE-NEXT: mov x9, v17.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[1], [x9] +; NO_SVE-NEXT: .LBB46_60: // %else86 +; NO_SVE-NEXT: add v2.2d, v18.2d, v2.2d +; NO_SVE-NEXT: tbnz w8, #30, .LBB46_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB46_63 +; NO_SVE-NEXT: .LBB46_62: // %cond.load91 +; NO_SVE-NEXT: mov x8, v2.d[1] +; NO_SVE-NEXT: ld1 { v16.s }[3], [x8] +; NO_SVE-NEXT: .LBB46_63: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q3, q4, [x0, #32] +; NO_SVE-NEXT: stp q5, q6, [x0, #64] +; NO_SVE-NEXT: stp q7, q16, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB46_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v16.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB46_62 +; NO_SVE-NEXT: b .LBB46_63 +; +; VBITS_EQ_256-LABEL: masked_gather_vec_plus_imm: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; 
VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [z18.d, #4] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [z17.d, #4] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [z16.d, #4] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [z6.d, #4] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [z7.d, #4] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [z5.d, #4] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [z0.d, #4] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [z4.d, #4] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_vec_plus_imm: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1185,6 +11367,449 @@ } define void @masked_gather_passthru(<32 x float>* %a, <32 x float*>* %b, <32 x float>* %c) #0 { +; NO_SVE-LABEL: masked_gather_passthru: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q4, [x0, #64] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldp q0, q2, [x0] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: uzp1 v3.8h, v3.8h, v4.8h +; NO_SVE-NEXT: ldp q5, q6, [x0, #96] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NO_SVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #48] +; NO_SVE-NEXT: xtn v2.8b, v3.8h +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: uzp1 
v5.8h, v5.8h, v6.8h +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v5.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v5.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v5.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w13, v5.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v3.4s, v4.4s, #0.0 +; NO_SVE-NEXT: ldr q4, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v4.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[6] +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: xtn v0.8b, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v5.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v5.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: ldr q0, [x2] +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi 
w8, w11, #16, #16 +; NO_SVE-NEXT: tbnz w8, #0, .LBB47_41 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB47_42 +; NO_SVE-NEXT: .LBB47_2: // %else2 +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB47_43 +; NO_SVE-NEXT: .LBB47_3: // %else5 +; NO_SVE-NEXT: tbz w8, #3, .LBB47_5 +; NO_SVE-NEXT: .LBB47_4: // %cond.load7 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_5: // %else8 +; NO_SVE-NEXT: ldr q2, [x1, #32] +; NO_SVE-NEXT: ldr q1, [x2, #16] +; NO_SVE-NEXT: tbnz w8, #4, .LBB47_44 +; NO_SVE-NEXT: // %bb.6: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB47_45 +; NO_SVE-NEXT: .LBB47_7: // %else14 +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB47_46 +; NO_SVE-NEXT: .LBB47_8: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB47_10 +; NO_SVE-NEXT: .LBB47_9: // %cond.load19 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_10: // %else20 +; NO_SVE-NEXT: ldr q3, [x1, #64] +; NO_SVE-NEXT: ldr q2, [x2, #32] +; NO_SVE-NEXT: tbnz w8, #8, .LBB47_47 +; NO_SVE-NEXT: // %bb.11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB47_48 +; NO_SVE-NEXT: .LBB47_12: // %else26 +; NO_SVE-NEXT: ldr q3, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB47_49 +; NO_SVE-NEXT: .LBB47_13: // %else29 +; NO_SVE-NEXT: tbz w8, #11, .LBB47_15 +; NO_SVE-NEXT: .LBB47_14: // %cond.load31 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_15: // %else32 +; NO_SVE-NEXT: ldr q4, [x1, #96] +; NO_SVE-NEXT: ldr q3, [x2, #48] +; NO_SVE-NEXT: tbnz w8, #12, .LBB47_50 +; NO_SVE-NEXT: // %bb.16: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB47_51 +; NO_SVE-NEXT: .LBB47_17: // %else38 +; NO_SVE-NEXT: ldr q4, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB47_52 +; NO_SVE-NEXT: .LBB47_18: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB47_20 +; NO_SVE-NEXT: .LBB47_19: // %cond.load43 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_20: // %else44 +; NO_SVE-NEXT: ldr q5, [x1, #128] +; NO_SVE-NEXT: ldr q4, [x2, #64] +; NO_SVE-NEXT: tbnz w8, #16, .LBB47_53 +; NO_SVE-NEXT: // %bb.21: // %else47 +; NO_SVE-NEXT: tbnz w8, #17, .LBB47_54 +; NO_SVE-NEXT: .LBB47_22: // %else50 +; NO_SVE-NEXT: ldr q5, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB47_55 +; NO_SVE-NEXT: .LBB47_23: // %else53 +; NO_SVE-NEXT: tbz w8, #19, .LBB47_25 +; NO_SVE-NEXT: .LBB47_24: // %cond.load55 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_25: // %else56 +; NO_SVE-NEXT: ldr q6, [x1, #160] +; NO_SVE-NEXT: ldr q5, [x2, #80] +; NO_SVE-NEXT: tbnz w8, #20, .LBB47_56 +; NO_SVE-NEXT: // %bb.26: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB47_57 +; NO_SVE-NEXT: .LBB47_27: // %else62 +; NO_SVE-NEXT: ldr q6, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB47_58 +; NO_SVE-NEXT: .LBB47_28: // %else65 +; NO_SVE-NEXT: tbz w8, #23, .LBB47_30 +; NO_SVE-NEXT: .LBB47_29: // %cond.load67 +; NO_SVE-NEXT: mov x9, v6.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_30: // %else68 +; NO_SVE-NEXT: ldr q7, [x1, #192] +; NO_SVE-NEXT: ldr q6, [x2, #96] +; NO_SVE-NEXT: tbnz w8, #24, .LBB47_59 +; NO_SVE-NEXT: // %bb.31: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB47_60 +; NO_SVE-NEXT: .LBB47_32: // %else74 +; NO_SVE-NEXT: ldr q7, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB47_61 +; NO_SVE-NEXT: .LBB47_33: // %else77 +; NO_SVE-NEXT: tbz w8, #27, .LBB47_35 +; NO_SVE-NEXT: .LBB47_34: // %cond.load79 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: 
ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: .LBB47_35: // %else80 +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: ldr q7, [x2, #112] +; NO_SVE-NEXT: tbnz w8, #28, .LBB47_62 +; NO_SVE-NEXT: // %bb.36: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB47_63 +; NO_SVE-NEXT: .LBB47_37: // %else86 +; NO_SVE-NEXT: ldr q16, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB47_64 +; NO_SVE-NEXT: .LBB47_38: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB47_40 +; NO_SVE-NEXT: .LBB47_39: // %cond.load91 +; NO_SVE-NEXT: mov x8, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x8] +; NO_SVE-NEXT: .LBB47_40: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB47_41: // %cond.load +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #1, .LBB47_2 +; NO_SVE-NEXT: .LBB47_42: // %cond.load1 +; NO_SVE-NEXT: mov x9, v1.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: tbz w8, #2, .LBB47_3 +; NO_SVE-NEXT: .LBB47_43: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d1 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #3, .LBB47_4 +; NO_SVE-NEXT: b .LBB47_5 +; NO_SVE-NEXT: .LBB47_44: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB47_7 +; NO_SVE-NEXT: .LBB47_45: // %cond.load13 +; NO_SVE-NEXT: mov x9, v2.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: ldr q2, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB47_8 +; NO_SVE-NEXT: .LBB47_46: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d2 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB47_9 +; NO_SVE-NEXT: b .LBB47_10 +; NO_SVE-NEXT: .LBB47_47: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB47_12 +; NO_SVE-NEXT: .LBB47_48: // %cond.load25 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #80] +; NO_SVE-NEXT: tbz w8, #10, .LBB47_13 +; NO_SVE-NEXT: .LBB47_49: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #11, .LBB47_14 +; NO_SVE-NEXT: b .LBB47_15 +; NO_SVE-NEXT: .LBB47_50: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB47_17 +; NO_SVE-NEXT: .LBB47_51: // %cond.load37 +; NO_SVE-NEXT: mov x9, v4.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: ldr q4, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB47_18 +; NO_SVE-NEXT: .LBB47_52: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d4 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB47_19 +; NO_SVE-NEXT: b .LBB47_20 +; NO_SVE-NEXT: .LBB47_53: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #17, .LBB47_22 +; NO_SVE-NEXT: .LBB47_54: // %cond.load49 +; NO_SVE-NEXT: mov x9, v5.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: ldr q5, [x1, #144] +; NO_SVE-NEXT: tbz w8, #18, .LBB47_23 +; NO_SVE-NEXT: .LBB47_55: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d5 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #19, .LBB47_24 +; NO_SVE-NEXT: b .LBB47_25 +; NO_SVE-NEXT: .LBB47_56: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB47_27 +; NO_SVE-NEXT: .LBB47_57: // %cond.load61 +; NO_SVE-NEXT: mov x9, v6.d[1] 
+; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: ldr q6, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB47_28 +; NO_SVE-NEXT: .LBB47_58: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d6 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #23, .LBB47_29 +; NO_SVE-NEXT: b .LBB47_30 +; NO_SVE-NEXT: .LBB47_59: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB47_32 +; NO_SVE-NEXT: .LBB47_60: // %cond.load73 +; NO_SVE-NEXT: mov x9, v7.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: ldr q7, [x1, #208] +; NO_SVE-NEXT: tbz w8, #26, .LBB47_33 +; NO_SVE-NEXT: .LBB47_61: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d7 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #27, .LBB47_34 +; NO_SVE-NEXT: b .LBB47_35 +; NO_SVE-NEXT: .LBB47_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB47_37 +; NO_SVE-NEXT: .LBB47_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB47_38 +; NO_SVE-NEXT: .LBB47_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB47_39 +; NO_SVE-NEXT: b .LBB47_40 +; +; VBITS_EQ_256-LABEL: masked_gather_passthru: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: ld1w { z17.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z20.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z23.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z4.s, #0.0 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: punpklo p3.h, p2.b +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: ld1d { z19.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z21.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z22.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x2, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x2, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z16.s }, p0/z, [x2, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x2] +; VBITS_EQ_256-NEXT: ld1w { z4.d }, p3/z, [z23.d] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z17.s, #0.0 +; VBITS_EQ_256-NEXT: mov z17.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_EQ_256-NEXT: bif v4.16b, v16.16b, v17.16b +; VBITS_EQ_256-NEXT: ext z17.b, z17.b, z17.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z23.d, z17.s +; VBITS_EQ_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z23.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z22.d }, p4/z, [z22.d] +; VBITS_EQ_256-NEXT: ld1w { z21.d }, p2/z, [z21.d] +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z20.s, #0.0 +; VBITS_EQ_256-NEXT: mov z20.s, p3/z, #-1 // 
=0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p2.b +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: uzp1 z21.s, z21.s, z21.s +; VBITS_EQ_256-NEXT: uzp1 z22.s, z22.s, z22.s +; VBITS_EQ_256-NEXT: bif v21.16b, v5.16b, v20.16b +; VBITS_EQ_256-NEXT: ext z20.b, z20.b, z20.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z23.d, z20.s +; VBITS_EQ_256-NEXT: ext z5.b, z5.b, z5.b, #16 +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z23.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z19.d }, p4/z, [z19.d] +; VBITS_EQ_256-NEXT: ld1w { z18.d }, p3/z, [z18.d] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z7.s, #0.0 +; VBITS_EQ_256-NEXT: mov z7.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p2.h, p3.b +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: uzp1 z18.s, z18.s, z18.s +; VBITS_EQ_256-NEXT: bif v18.16b, v1.16b, v7.16b +; VBITS_EQ_256-NEXT: ext z7.b, z7.b, z7.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z23.d, z7.s +; VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z23.d, #0 +; VBITS_EQ_256-NEXT: mov z23.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p4/z, [z2.d] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p2/z, [z3.d] +; VBITS_EQ_256-NEXT: bit v16.16b, v22.16b, v17.16b +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: bif v3.16b, v6.16b, v23.16b +; VBITS_EQ_256-NEXT: ext z23.b, z23.b, z23.b, #16 +; VBITS_EQ_256-NEXT: sunpklo z17.d, z23.s +; VBITS_EQ_256-NEXT: ext z6.b, z6.b, z6.b, #16 +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z17.d, #0 +; VBITS_EQ_256-NEXT: uzp1 z17.s, z19.s, z19.s +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; VBITS_EQ_256-NEXT: ptrue p1.s, vl4 +; VBITS_EQ_256-NEXT: bit v5.16b, v17.16b, v20.16b +; VBITS_EQ_256-NEXT: splice z4.s, p1, z4.s, z16.s +; VBITS_EQ_256-NEXT: bit v1.16b, v2.16b, v7.16b +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: bif v0.16b, v6.16b, v23.16b +; VBITS_EQ_256-NEXT: splice z21.s, p1, z21.s, z5.s +; VBITS_EQ_256-NEXT: splice z18.s, p1, z18.s, z1.s +; VBITS_EQ_256-NEXT: st1w { z21.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z18.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: splice z3.s, p1, z3.s, z0.s +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_passthru: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1209,6 +11834,447 @@ } define void @masked_gather_passthru_0(<32 x float>* %a, <32 x float*>* %b) #0 { +; NO_SVE-LABEL: masked_gather_passthru_0: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #64] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: ldp q4, q5, [x0, #96] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: xtn v1.8b, v2.8h +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: uzp1 v4.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w9, v1.b[0] +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; 
NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w8, v4.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w10, v4.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v4.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w12, v4.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w13, v4.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: fcmeq v2.4s, v3.4s, #0.0 +; NO_SVE-NEXT: ldr q3, [x0, #32] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #9 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: fcmeq v1.4s, v3.4s, #0.0 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: bfi w9, w15, #1, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: xtn v0.8b, v1.8h +; NO_SVE-NEXT: bfi w9, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v4.b[6] +; NO_SVE-NEXT: bfi w9, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w8, w8, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v4.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: ldr q3, [x1] +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB48_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; 
NO_SVE-NEXT: ld1 { v0.s }[0], [x9] +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbnz w8, #1, .LBB48_3 +; NO_SVE-NEXT: b .LBB48_4 +; NO_SVE-NEXT: .LBB48_2: +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbz w8, #1, .LBB48_4 +; NO_SVE-NEXT: .LBB48_3: // %cond.load1 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB48_4: // %else2 +; NO_SVE-NEXT: ldr q3, [x1, #16] +; NO_SVE-NEXT: tbnz w8, #2, .LBB48_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB48_13 +; NO_SVE-NEXT: .LBB48_6: // %else8 +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbnz w8, #4, .LBB48_14 +; NO_SVE-NEXT: .LBB48_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB48_15 +; NO_SVE-NEXT: .LBB48_8: // %else14 +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbnz w8, #6, .LBB48_16 +; NO_SVE-NEXT: .LBB48_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB48_17 +; NO_SVE-NEXT: .LBB48_10: // %else20 +; NO_SVE-NEXT: ldr q16, [x1, #64] +; NO_SVE-NEXT: tbz w8, #8, .LBB48_18 +; NO_SVE-NEXT: .LBB48_11: // %cond.load22 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: mov v17.16b, v2.16b +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: ld1 { v17.s }[0], [x9] +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v5.16b, v2.16b +; NO_SVE-NEXT: mov v6.16b, v2.16b +; NO_SVE-NEXT: mov v7.16b, v2.16b +; NO_SVE-NEXT: mov v2.16b, v17.16b +; NO_SVE-NEXT: tbnz w8, #9, .LBB48_19 +; NO_SVE-NEXT: b .LBB48_20 +; NO_SVE-NEXT: .LBB48_12: // %cond.load4 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB48_6 +; NO_SVE-NEXT: .LBB48_13: // %cond.load7 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #32] +; NO_SVE-NEXT: tbz w8, #4, .LBB48_7 +; NO_SVE-NEXT: .LBB48_14: // %cond.load10 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB48_8 +; NO_SVE-NEXT: .LBB48_15: // %cond.load13 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: ldr q3, [x1, #48] +; NO_SVE-NEXT: tbz w8, #6, .LBB48_9 +; NO_SVE-NEXT: .LBB48_16: // %cond.load16 +; NO_SVE-NEXT: fmov x9, d3 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB48_10 +; NO_SVE-NEXT: .LBB48_17: // %cond.load19 +; NO_SVE-NEXT: mov x9, v3.d[1] +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #64] +; NO_SVE-NEXT: tbnz w8, #8, .LBB48_11 +; NO_SVE-NEXT: .LBB48_18: +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v5.16b, v2.16b +; NO_SVE-NEXT: mov v6.16b, v2.16b +; NO_SVE-NEXT: mov v7.16b, v2.16b +; NO_SVE-NEXT: tbz w8, #9, .LBB48_20 +; NO_SVE-NEXT: .LBB48_19: // %cond.load25 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB48_20: // %else26 +; NO_SVE-NEXT: ldr q16, [x1, #80] +; NO_SVE-NEXT: tbnz w8, #10, .LBB48_44 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB48_45 +; NO_SVE-NEXT: .LBB48_22: // %else32 +; NO_SVE-NEXT: ldr q16, [x1, #96] +; NO_SVE-NEXT: tbnz w8, #12, .LBB48_46 +; NO_SVE-NEXT: .LBB48_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB48_47 +; NO_SVE-NEXT: .LBB48_24: // %else38 +; NO_SVE-NEXT: ldr q16, [x1, #112] +; NO_SVE-NEXT: tbnz w8, #14, .LBB48_48 +; NO_SVE-NEXT: .LBB48_25: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB48_49 +; NO_SVE-NEXT: .LBB48_26: // %else44 +; NO_SVE-NEXT: ldr q16, [x1, #128] +; NO_SVE-NEXT: tbnz w8, #16, .LBB48_50 +; NO_SVE-NEXT: 
.LBB48_27: // %else47 +; NO_SVE-NEXT: tbnz w8, #17, .LBB48_51 +; NO_SVE-NEXT: .LBB48_28: // %else50 +; NO_SVE-NEXT: ldr q16, [x1, #144] +; NO_SVE-NEXT: tbnz w8, #18, .LBB48_52 +; NO_SVE-NEXT: .LBB48_29: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB48_53 +; NO_SVE-NEXT: .LBB48_30: // %else56 +; NO_SVE-NEXT: ldr q16, [x1, #160] +; NO_SVE-NEXT: tbnz w8, #20, .LBB48_54 +; NO_SVE-NEXT: .LBB48_31: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB48_55 +; NO_SVE-NEXT: .LBB48_32: // %else62 +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbnz w8, #22, .LBB48_56 +; NO_SVE-NEXT: .LBB48_33: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB48_57 +; NO_SVE-NEXT: .LBB48_34: // %else68 +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbnz w8, #24, .LBB48_58 +; NO_SVE-NEXT: .LBB48_35: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB48_59 +; NO_SVE-NEXT: .LBB48_36: // %else74 +; NO_SVE-NEXT: ldr q16, [x1, #208] +; NO_SVE-NEXT: tbnz w8, #26, .LBB48_60 +; NO_SVE-NEXT: .LBB48_37: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB48_61 +; NO_SVE-NEXT: .LBB48_38: // %else80 +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: tbnz w8, #28, .LBB48_62 +; NO_SVE-NEXT: .LBB48_39: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB48_63 +; NO_SVE-NEXT: .LBB48_40: // %else86 +; NO_SVE-NEXT: ldr q16, [x1, #240] +; NO_SVE-NEXT: tbnz w8, #30, .LBB48_64 +; NO_SVE-NEXT: .LBB48_41: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB48_43 +; NO_SVE-NEXT: .LBB48_42: // %cond.load91 +; NO_SVE-NEXT: mov x8, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[3], [x8] +; NO_SVE-NEXT: .LBB48_43: // %else92 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: stp q2, q3, [x0, #32] +; NO_SVE-NEXT: stp q4, q5, [x0, #64] +; NO_SVE-NEXT: stp q6, q7, [x0, #96] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB48_44: // %cond.load28 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB48_22 +; NO_SVE-NEXT: .LBB48_45: // %cond.load31 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #96] +; NO_SVE-NEXT: tbz w8, #12, .LBB48_23 +; NO_SVE-NEXT: .LBB48_46: // %cond.load34 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB48_24 +; NO_SVE-NEXT: .LBB48_47: // %cond.load37 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #112] +; NO_SVE-NEXT: tbz w8, #14, .LBB48_25 +; NO_SVE-NEXT: .LBB48_48: // %cond.load40 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB48_26 +; NO_SVE-NEXT: .LBB48_49: // %cond.load43 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #128] +; NO_SVE-NEXT: tbz w8, #16, .LBB48_27 +; NO_SVE-NEXT: .LBB48_50: // %cond.load46 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #17, .LBB48_28 +; NO_SVE-NEXT: .LBB48_51: // %cond.load49 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #144] +; NO_SVE-NEXT: tbz w8, #18, .LBB48_29 +; NO_SVE-NEXT: .LBB48_52: // %cond.load52 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB48_30 +; NO_SVE-NEXT: .LBB48_53: // %cond.load55 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #160] +; NO_SVE-NEXT: tbz w8, #20, .LBB48_31 +; NO_SVE-NEXT: .LBB48_54: // %cond.load58 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] 
+; NO_SVE-NEXT: tbz w8, #21, .LBB48_32 +; NO_SVE-NEXT: .LBB48_55: // %cond.load61 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #176] +; NO_SVE-NEXT: tbz w8, #22, .LBB48_33 +; NO_SVE-NEXT: .LBB48_56: // %cond.load64 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB48_34 +; NO_SVE-NEXT: .LBB48_57: // %cond.load67 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #192] +; NO_SVE-NEXT: tbz w8, #24, .LBB48_35 +; NO_SVE-NEXT: .LBB48_58: // %cond.load70 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB48_36 +; NO_SVE-NEXT: .LBB48_59: // %cond.load73 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #208] +; NO_SVE-NEXT: tbz w8, #26, .LBB48_37 +; NO_SVE-NEXT: .LBB48_60: // %cond.load76 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB48_38 +; NO_SVE-NEXT: .LBB48_61: // %cond.load79 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #224] +; NO_SVE-NEXT: tbz w8, #28, .LBB48_39 +; NO_SVE-NEXT: .LBB48_62: // %cond.load82 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB48_40 +; NO_SVE-NEXT: .LBB48_63: // %cond.load85 +; NO_SVE-NEXT: mov x9, v16.d[1] +; NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: ldr q16, [x1, #240] +; NO_SVE-NEXT: tbz w8, #30, .LBB48_41 +; NO_SVE-NEXT: .LBB48_64: // %cond.load88 +; NO_SVE-NEXT: fmov x9, d16 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB48_42 +; NO_SVE-NEXT: b .LBB48_43 +; +; VBITS_EQ_256-LABEL: masked_gather_passthru_0: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x14, #28 +; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #20 +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_EQ_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z2.d, z19.s +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [z18.d] +; VBITS_EQ_256-NEXT: ld1w { z17.d }, p3/z, [z17.d] +; VBITS_EQ_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_EQ_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_EQ_256-NEXT: ext 
z1.b, z1.b, z1.b, #16 +; VBITS_EQ_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_EQ_256-NEXT: punpklo p3.h, p3.b +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_EQ_256-NEXT: punpklo p4.h, p4.b +; VBITS_EQ_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_EQ_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_EQ_256-NEXT: ld1w { z16.d }, p3/z, [z16.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p5/z, [z6.d] +; VBITS_EQ_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_EQ_256-NEXT: sunpklo z6.d, z18.s +; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b +; VBITS_EQ_256-NEXT: sunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: ld1w { z7.d }, p4/z, [z7.d] +; VBITS_EQ_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_EQ_256-NEXT: ld1w { z5.d }, p4/z, [z5.d] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; VBITS_EQ_256-NEXT: ld1w { z3.d }, p1/z, [z4.d] +; VBITS_EQ_256-NEXT: ptrue p3.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_EQ_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_EQ_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_EQ_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_EQ_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_EQ_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_EQ_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_passthru_0: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK @@ -24,6 +25,34 @@ ; Masked Loads ; define <2 x half> @masked_load_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v2f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr s1, [x0] +; NO_SVE-NEXT: movi d0, #0000000000000000 +; NO_SVE-NEXT: ldr s2, [x1] +; NO_SVE-NEXT: fcmeq v1.4h, v1.4h, v2.4h +; NO_SVE-NEXT: umov w8, v1.h[1] +; NO_SVE-NEXT: umov w9, v1.h[0] +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB0_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: movi d0, #0000000000000000 +; NO_SVE-NEXT: ld1 { v0.h }[0], [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB0_4 +; NO_SVE-NEXT: b .LBB0_3 +; NO_SVE-NEXT: .LBB0_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB0_4 +; NO_SVE-NEXT: .LBB0_3: // 
%cond.load1 +; NO_SVE-NEXT: add x8, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x8] +; NO_SVE-NEXT: .LBB0_4: // %else2 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_load_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s1, [x0] @@ -55,6 +84,34 @@ } define <2 x float> @masked_load_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d1, [x0] +; NO_SVE-NEXT: movi d0, #0000000000000000 +; NO_SVE-NEXT: ldr d2, [x1] +; NO_SVE-NEXT: fcmeq v1.2s, v1.2s, v2.2s +; NO_SVE-NEXT: mov w8, v1.s[1] +; NO_SVE-NEXT: fmov w9, s1 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbz w9, #0, .LBB1_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: movi d0, #0000000000000000 +; NO_SVE-NEXT: ld1 { v0.s }[0], [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB1_4 +; NO_SVE-NEXT: b .LBB1_3 +; NO_SVE-NEXT: .LBB1_2: // %else +; NO_SVE-NEXT: tbz w8, #1, .LBB1_4 +; NO_SVE-NEXT: .LBB1_3: // %cond.load1 +; NO_SVE-NEXT: add x8, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x8] +; NO_SVE-NEXT: .LBB1_4: // %else2 +; NO_SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_load_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -73,6 +130,53 @@ } define <4 x float> @masked_load_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q1, [x0] +; NO_SVE-NEXT: ldr q2, [x1] +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v2.4s +; NO_SVE-NEXT: xtn v1.4h, v1.4s +; NO_SVE-NEXT: umov w8, v1.h[1] +; NO_SVE-NEXT: umov w9, v1.h[2] +; NO_SVE-NEXT: umov w10, v1.h[0] +; NO_SVE-NEXT: umov w11, v1.h[3] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbnz w10, #0, .LBB2_5 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB2_6 +; NO_SVE-NEXT: .LBB2_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB2_7 +; NO_SVE-NEXT: .LBB2_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB2_8 +; NO_SVE-NEXT: .LBB2_4: // %else8 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB2_5: // %cond.load +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: ld1 { v0.s }[0], [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB2_2 +; NO_SVE-NEXT: .LBB2_6: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB2_3 +; NO_SVE-NEXT: .LBB2_7: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB2_4 +; NO_SVE-NEXT: .LBB2_8: // %cond.load7 +; NO_SVE-NEXT: add x8, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_load_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -91,6 +195,94 @@ } define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: ldp q4, q3, 
[x1] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v4.4s +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: umov w9, v1.b[2] +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: and w9, w13, #0x1 +; NO_SVE-NEXT: bfi w10, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: bfi w10, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #5, #1 +; NO_SVE-NEXT: orr w8, w10, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB3_9 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB3_10 +; NO_SVE-NEXT: .LBB3_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB3_11 +; NO_SVE-NEXT: .LBB3_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB3_12 +; NO_SVE-NEXT: .LBB3_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB3_13 +; NO_SVE-NEXT: .LBB3_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB3_14 +; NO_SVE-NEXT: .LBB3_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB3_15 +; NO_SVE-NEXT: .LBB3_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB3_16 +; NO_SVE-NEXT: .LBB3_8: // %else20 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB3_9: // %cond.load +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; NO_SVE-NEXT: ld1 { v0.s }[0], [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB3_2 +; NO_SVE-NEXT: .LBB3_10: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB3_3 +; NO_SVE-NEXT: .LBB3_11: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB3_4 +; NO_SVE-NEXT: .LBB3_12: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB3_5 +; NO_SVE-NEXT: .LBB3_13: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB3_6 +; NO_SVE-NEXT: .LBB3_14: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB3_7 +; NO_SVE-NEXT: .LBB3_15: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB3_8 +; NO_SVE-NEXT: .LBB3_16: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_load_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -108,6 +300,176 @@ } define <16 x float> @masked_load_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ldp q3, q2, [x1] +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, v3.4s +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v2.4s +; NO_SVE-NEXT: ldp q4, q3, [x1, #32] +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: ldp q1, q2, [x0, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, 
v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v4.4s +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB4_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbnz w8, #1, .LBB4_11 +; NO_SVE-NEXT: .LBB4_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB4_12 +; NO_SVE-NEXT: .LBB4_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB4_13 +; NO_SVE-NEXT: .LBB4_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB4_14 +; NO_SVE-NEXT: .LBB4_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB4_15 +; NO_SVE-NEXT: .LBB4_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB4_16 +; NO_SVE-NEXT: .LBB4_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB4_17 +; NO_SVE-NEXT: .LBB4_8: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB4_18 +; NO_SVE-NEXT: .LBB4_9: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: mov v2.16b, v4.16b +; NO_SVE-NEXT: tbnz w8, #9, .LBB4_19 +; NO_SVE-NEXT: b .LBB4_20 +; NO_SVE-NEXT: .LBB4_10: // %cond.load +; NO_SVE-NEXT: ld1 { v0.s }[0], [x0] +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbz w8, #1, .LBB4_2 +; NO_SVE-NEXT: .LBB4_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB4_3 +; NO_SVE-NEXT: .LBB4_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB4_4 +; NO_SVE-NEXT: .LBB4_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB4_5 +; NO_SVE-NEXT: .LBB4_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s 
}[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB4_6 +; NO_SVE-NEXT: .LBB4_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB4_7 +; NO_SVE-NEXT: .LBB4_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB4_8 +; NO_SVE-NEXT: .LBB4_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB4_9 +; NO_SVE-NEXT: .LBB4_18: +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: tbz w8, #9, .LBB4_20 +; NO_SVE-NEXT: .LBB4_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB4_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB4_27 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB4_28 +; NO_SVE-NEXT: .LBB4_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB4_29 +; NO_SVE-NEXT: .LBB4_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB4_30 +; NO_SVE-NEXT: .LBB4_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB4_31 +; NO_SVE-NEXT: .LBB4_25: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB4_32 +; NO_SVE-NEXT: .LBB4_26: // %else44 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB4_27: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB4_22 +; NO_SVE-NEXT: .LBB4_28: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB4_23 +; NO_SVE-NEXT: .LBB4_29: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB4_24 +; NO_SVE-NEXT: .LBB4_30: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB4_25 +; NO_SVE-NEXT: .LBB4_31: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB4_26 +; NO_SVE-NEXT: .LBB4_32: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #60 +; NO_SVE-NEXT: ld1 { v3.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -125,6 +487,338 @@ } define <32 x float> @masked_load_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #64] +; NO_SVE-NEXT: ldp q2, q3, [x1, #64] +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, v2.4s +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v3.4s +; NO_SVE-NEXT: ldp q3, q2, [x0, #96] +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: ldp q6, q7, [x1, #96] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, v6.4s +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v7.4s +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[7] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: ldp q16, q1, [x1] +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: bfi w9, w10, #2, #1 
+; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w10, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[1] +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, v16.4s +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v2.b[2] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: fcmeq v1.4s, v5.4s, v1.4s +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: ldp q3, q5, [x0, #32] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[3] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v4.8h, v1.8h +; NO_SVE-NEXT: umov w14, v2.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #7 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #8 +; NO_SVE-NEXT: ldp q1, q4, [x1, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: orr w8, w8, w11, lsl #9 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #10 +; NO_SVE-NEXT: umov w10, v0.b[1] +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[0] +; NO_SVE-NEXT: fcmeq v1.4s, v3.4s, v1.4s +; NO_SVE-NEXT: orr w8, w8, w13, lsl #11 +; NO_SVE-NEXT: fcmeq v4.4s, v5.4s, v4.4s +; NO_SVE-NEXT: orr w8, w8, w14, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[3] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v4.8h +; NO_SVE-NEXT: bfi w11, w10, #1, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #2, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: umov w9, v2.b[5] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: bfi w11, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: xtn v0.8b, v1.8h +; NO_SVE-NEXT: bfi w11, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v2.b[6] +; NO_SVE-NEXT: bfi w11, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w9, w11, w13, lsl #6 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v2.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; NO_SVE-NEXT: tbnz w8, #0, .LBB5_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbnz w8, #1, .LBB5_11 +; NO_SVE-NEXT: .LBB5_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB5_12 +; 
NO_SVE-NEXT: .LBB5_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB5_13 +; NO_SVE-NEXT: .LBB5_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB5_14 +; NO_SVE-NEXT: .LBB5_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB5_15 +; NO_SVE-NEXT: .LBB5_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB5_16 +; NO_SVE-NEXT: .LBB5_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB5_17 +; NO_SVE-NEXT: .LBB5_8: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB5_18 +; NO_SVE-NEXT: .LBB5_9: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: mov v16.16b, v2.16b +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: ld1 { v16.s }[0], [x9] +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v5.16b, v2.16b +; NO_SVE-NEXT: mov v6.16b, v2.16b +; NO_SVE-NEXT: mov v7.16b, v2.16b +; NO_SVE-NEXT: mov v2.16b, v16.16b +; NO_SVE-NEXT: tbnz w8, #9, .LBB5_19 +; NO_SVE-NEXT: b .LBB5_20 +; NO_SVE-NEXT: .LBB5_10: // %cond.load +; NO_SVE-NEXT: ld1 { v0.s }[0], [x0] +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbz w8, #1, .LBB5_2 +; NO_SVE-NEXT: .LBB5_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB5_3 +; NO_SVE-NEXT: .LBB5_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB5_4 +; NO_SVE-NEXT: .LBB5_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB5_5 +; NO_SVE-NEXT: .LBB5_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB5_6 +; NO_SVE-NEXT: .LBB5_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB5_7 +; NO_SVE-NEXT: .LBB5_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB5_8 +; NO_SVE-NEXT: .LBB5_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB5_9 +; NO_SVE-NEXT: .LBB5_18: +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v5.16b, v2.16b +; NO_SVE-NEXT: mov v6.16b, v2.16b +; NO_SVE-NEXT: mov v7.16b, v2.16b +; NO_SVE-NEXT: tbz w8, #9, .LBB5_20 +; NO_SVE-NEXT: .LBB5_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB5_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB5_43 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB5_44 +; NO_SVE-NEXT: .LBB5_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB5_45 +; NO_SVE-NEXT: .LBB5_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB5_46 +; NO_SVE-NEXT: .LBB5_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB5_47 +; NO_SVE-NEXT: .LBB5_25: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB5_48 +; NO_SVE-NEXT: .LBB5_26: // %else44 +; NO_SVE-NEXT: tbnz w8, #16, .LBB5_49 +; NO_SVE-NEXT: .LBB5_27: // %else47 +; NO_SVE-NEXT: tbnz w8, #17, .LBB5_50 +; NO_SVE-NEXT: .LBB5_28: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB5_51 +; NO_SVE-NEXT: .LBB5_29: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB5_52 +; NO_SVE-NEXT: .LBB5_30: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB5_53 +; NO_SVE-NEXT: .LBB5_31: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB5_54 +; NO_SVE-NEXT: .LBB5_32: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB5_55 +; NO_SVE-NEXT: .LBB5_33: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB5_56 +; NO_SVE-NEXT: .LBB5_34: // %else68 +; NO_SVE-NEXT: tbnz w8, #24, .LBB5_57 +; 
NO_SVE-NEXT: .LBB5_35: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB5_58 +; NO_SVE-NEXT: .LBB5_36: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB5_59 +; NO_SVE-NEXT: .LBB5_37: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB5_60 +; NO_SVE-NEXT: .LBB5_38: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB5_61 +; NO_SVE-NEXT: .LBB5_39: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB5_62 +; NO_SVE-NEXT: .LBB5_40: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB5_63 +; NO_SVE-NEXT: .LBB5_41: // %else89 +; NO_SVE-NEXT: tbnz w8, #31, .LBB5_64 +; NO_SVE-NEXT: .LBB5_42: // %else92 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB5_43: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB5_22 +; NO_SVE-NEXT: .LBB5_44: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB5_23 +; NO_SVE-NEXT: .LBB5_45: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB5_24 +; NO_SVE-NEXT: .LBB5_46: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB5_25 +; NO_SVE-NEXT: .LBB5_47: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB5_26 +; NO_SVE-NEXT: .LBB5_48: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #60 +; NO_SVE-NEXT: ld1 { v3.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #16, .LBB5_27 +; NO_SVE-NEXT: .LBB5_49: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #64 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #17, .LBB5_28 +; NO_SVE-NEXT: .LBB5_50: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #68 +; NO_SVE-NEXT: ld1 { v4.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #18, .LBB5_29 +; NO_SVE-NEXT: .LBB5_51: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #72 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB5_30 +; NO_SVE-NEXT: .LBB5_52: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #76 +; NO_SVE-NEXT: ld1 { v4.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB5_31 +; NO_SVE-NEXT: .LBB5_53: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #80 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB5_32 +; NO_SVE-NEXT: .LBB5_54: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #84 +; NO_SVE-NEXT: ld1 { v5.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB5_33 +; NO_SVE-NEXT: .LBB5_55: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #88 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB5_34 +; NO_SVE-NEXT: .LBB5_56: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #92 +; NO_SVE-NEXT: ld1 { v5.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB5_35 +; NO_SVE-NEXT: .LBB5_57: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #96 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB5_36 +; NO_SVE-NEXT: .LBB5_58: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #100 +; NO_SVE-NEXT: ld1 { v6.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB5_37 +; NO_SVE-NEXT: .LBB5_59: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #104 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB5_38 +; NO_SVE-NEXT: .LBB5_60: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #108 +; NO_SVE-NEXT: ld1 { v6.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB5_39 +; NO_SVE-NEXT: .LBB5_61: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #112 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB5_40 +; NO_SVE-NEXT: .LBB5_62: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #116 +; 
NO_SVE-NEXT: ld1 { v7.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB5_41 +; NO_SVE-NEXT: .LBB5_63: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #120 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #31, .LBB5_42 +; NO_SVE-NEXT: .LBB5_64: // %cond.load91 +; NO_SVE-NEXT: add x8, x0, #124 +; NO_SVE-NEXT: ld1 { v7.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_load_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -142,6 +836,671 @@ } define <64 x float> @masked_load_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #192] +; NO_SVE-NEXT: ldp q4, q5, [x1, #192] +; NO_SVE-NEXT: fcmeq v2.4s, v2.4s, v4.4s +; NO_SVE-NEXT: fcmeq v3.4s, v3.4s, v5.4s +; NO_SVE-NEXT: ldp q0, q1, [x0, #224] +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: ldp q6, q7, [x1, #224] +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w9, v2.b[1] +; NO_SVE-NEXT: umov w11, v2.b[2] +; NO_SVE-NEXT: fcmeq v6.4s, v0.4s, v6.4s +; NO_SVE-NEXT: umov w10, v2.b[0] +; NO_SVE-NEXT: umov w12, v2.b[3] +; NO_SVE-NEXT: fcmeq v7.4s, v1.4s, v7.4s +; NO_SVE-NEXT: umov w13, v2.b[4] +; NO_SVE-NEXT: ldp q16, q17, [x0, #128] +; NO_SVE-NEXT: umov w14, v2.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[6] +; NO_SVE-NEXT: uzp1 v3.8h, v6.8h, v7.8h +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w9, w14, #0x1 +; NO_SVE-NEXT: ldp q21, q22, [x1, #128] +; NO_SVE-NEXT: xtn v18.8b, v3.8h +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w16, v2.b[7] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w11, v18.b[0] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: umov w15, v18.b[6] +; NO_SVE-NEXT: fcmeq v21.4s, v16.4s, v21.4s +; NO_SVE-NEXT: bfi w10, w9, #5, #1 +; NO_SVE-NEXT: fcmeq v22.4s, v17.4s, v22.4s +; NO_SVE-NEXT: orr w9, w10, w12, lsl #6 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: umov w11, v18.b[1] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w12, v18.b[2] +; NO_SVE-NEXT: ldp q23, q24, [x0, #160] +; NO_SVE-NEXT: uzp1 v25.8h, v21.8h, v22.8h +; NO_SVE-NEXT: orr w9, w9, w13, lsl #7 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: umov w11, v18.b[3] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #9 +; NO_SVE-NEXT: xtn v25.8b, v25.8h +; NO_SVE-NEXT: orr w9, w9, w12, lsl #10 +; NO_SVE-NEXT: ldp q28, q29, [x1, #160] +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: umov w11, v18.b[4] +; NO_SVE-NEXT: umov w12, v25.b[1] +; NO_SVE-NEXT: umov w13, v25.b[0] +; NO_SVE-NEXT: umov w14, v25.b[2] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #11 +; NO_SVE-NEXT: fcmeq v23.4s, v23.4s, v28.4s +; NO_SVE-NEXT: umov w10, v18.b[5] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w16, v25.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #12 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v25.b[3] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #1, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v25.b[4] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #13 +; NO_SVE-NEXT: fcmeq v24.4s, v24.4s, v29.4s +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: ldp 
q19, q20, [x0, #64] +; NO_SVE-NEXT: bfi w11, w12, #2, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v25.b[5] +; NO_SVE-NEXT: uzp1 v23.8h, v23.8h, v24.8h +; NO_SVE-NEXT: orr w9, w9, w10, lsl #14 +; NO_SVE-NEXT: bfi w11, w12, #3, #1 +; NO_SVE-NEXT: umov w15, v18.b[7] +; NO_SVE-NEXT: bfi w11, w13, #4, #1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: ldp q26, q27, [x1, #64] +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: xtn v23.8b, v23.8h +; NO_SVE-NEXT: umov w14, v25.b[7] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #15 +; NO_SVE-NEXT: umov w15, v23.b[6] +; NO_SVE-NEXT: fcmeq v19.4s, v19.4s, v26.4s +; NO_SVE-NEXT: bfi w11, w12, #5, #1 +; NO_SVE-NEXT: umov w12, v23.b[0] +; NO_SVE-NEXT: orr w10, w11, w13, lsl #6 +; NO_SVE-NEXT: fcmeq v20.4s, v20.4s, v27.4s +; NO_SVE-NEXT: umov w13, v23.b[1] +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v23.b[2] +; NO_SVE-NEXT: ldp q6, q7, [x0, #96] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #7 +; NO_SVE-NEXT: uzp1 v19.8h, v19.8h, v20.8h +; NO_SVE-NEXT: umov w11, v23.b[3] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v23.b[4] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #9 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: xtn v18.8b, v19.8h +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: ldp q21, q22, [x1, #96] +; NO_SVE-NEXT: umov w12, v18.b[1] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #10 +; NO_SVE-NEXT: umov w13, v23.b[5] +; NO_SVE-NEXT: umov w16, v18.b[0] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #11 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v18.b[2] +; NO_SVE-NEXT: umov w17, v18.b[5] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: fcmeq v7.4s, v7.4s, v22.4s +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: fcmeq v6.4s, v6.4s, v21.4s +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w13, w12, #1, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v18.b[3] +; NO_SVE-NEXT: umov w16, v18.b[4] +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: uzp1 v6.8h, v6.8h, v7.8h +; NO_SVE-NEXT: bfi w13, w12, #2, #1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #13 +; NO_SVE-NEXT: umov w11, v18.b[7] +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v18.b[6] +; NO_SVE-NEXT: xtn v6.8b, v6.8h +; NO_SVE-NEXT: orr w10, w10, w15, lsl #14 +; NO_SVE-NEXT: ldp q16, q17, [x1] +; NO_SVE-NEXT: bfi w13, w12, #3, #1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w13, w14, #4, #1 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: umov w14, v6.b[0] +; NO_SVE-NEXT: bfi w13, w16, #5, #1 +; NO_SVE-NEXT: fcmeq v4.4s, v4.4s, v16.4s +; NO_SVE-NEXT: orr w12, w13, w12, lsl #6 +; NO_SVE-NEXT: umov w13, v6.b[1] +; NO_SVE-NEXT: orr w11, w12, w11, lsl #7 +; NO_SVE-NEXT: fcmeq v5.4s, v5.4s, v17.4s +; NO_SVE-NEXT: umov w16, v6.b[3] +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v6.b[2] +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #8 +; NO_SVE-NEXT: uzp1 v4.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w12, v6.b[4] +; NO_SVE-NEXT: umov w15, v23.b[7] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v6.b[5] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: ldp q2, q3, [x1, #32] +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: orr w11, w11, 
w13, lsl #10 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #15 +; NO_SVE-NEXT: umov w17, v4.b[1] +; NO_SVE-NEXT: orr w11, w11, w16, lsl #11 +; NO_SVE-NEXT: umov w15, v4.b[2] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #12 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[0] +; NO_SVE-NEXT: fcmeq v1.4s, v1.4s, v3.4s +; NO_SVE-NEXT: umov w16, v4.b[3] +; NO_SVE-NEXT: fcmeq v0.4s, v0.4s, v2.4s +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: umov w17, v4.b[4] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w18, v4.b[5] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w14, w13, #1, #1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: bfi w14, w15, #2, #1 +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: umov w17, v4.b[6] +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: bfi w14, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v4.b[7] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #13 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w14, w15, #4, #1 +; NO_SVE-NEXT: umov w15, v6.b[6] +; NO_SVE-NEXT: bfi w14, w16, #5, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[0] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[1] +; NO_SVE-NEXT: orr w12, w14, w16, lsl #6 +; NO_SVE-NEXT: bfi w10, w9, #16, #16 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[2] +; NO_SVE-NEXT: orr w12, w12, w13, lsl #7 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[3] +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w12, w12, w13, lsl #8 +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #14 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[6] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #10 +; NO_SVE-NEXT: umov w14, v6.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w12, w12, w16, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: orr w12, w12, w13, lsl #12 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #15 +; NO_SVE-NEXT: orr w12, w12, w15, lsl #13 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #15 +; NO_SVE-NEXT: bfi w9, w11, #16, #16 +; NO_SVE-NEXT: movi v0.2d, #0000000000000000 +; NO_SVE-NEXT: bfi x9, x10, #32, #32 +; NO_SVE-NEXT: movi v1.2d, #0000000000000000 +; NO_SVE-NEXT: tbnz w9, #0, .LBB6_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbnz w9, #1, .LBB6_11 +; NO_SVE-NEXT: .LBB6_2: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB6_12 +; NO_SVE-NEXT: .LBB6_3: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB6_13 +; NO_SVE-NEXT: .LBB6_4: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB6_14 +; NO_SVE-NEXT: .LBB6_5: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB6_15 +; NO_SVE-NEXT: .LBB6_6: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB6_16 +; NO_SVE-NEXT: .LBB6_7: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB6_17 +; NO_SVE-NEXT: .LBB6_8: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB6_18 +; NO_SVE-NEXT: .LBB6_9: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: mov v24.16b, v2.16b +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: ld1 { v24.s }[0], [x10] +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v5.16b, v2.16b +; NO_SVE-NEXT: mov v6.16b, v2.16b +; NO_SVE-NEXT: mov v7.16b, v2.16b +; NO_SVE-NEXT: mov v16.16b, v2.16b +; NO_SVE-NEXT: mov v17.16b, v2.16b 
+; NO_SVE-NEXT: mov v18.16b, v2.16b +; NO_SVE-NEXT: mov v19.16b, v2.16b +; NO_SVE-NEXT: mov v20.16b, v2.16b +; NO_SVE-NEXT: mov v21.16b, v2.16b +; NO_SVE-NEXT: mov v22.16b, v2.16b +; NO_SVE-NEXT: mov v23.16b, v2.16b +; NO_SVE-NEXT: mov v2.16b, v24.16b +; NO_SVE-NEXT: tbnz w9, #9, .LBB6_19 +; NO_SVE-NEXT: b .LBB6_20 +; NO_SVE-NEXT: .LBB6_10: // %cond.load +; NO_SVE-NEXT: ld1 { v0.s }[0], [x0] +; NO_SVE-NEXT: movi v2.2d, #0000000000000000 +; NO_SVE-NEXT: tbz w9, #1, .LBB6_2 +; NO_SVE-NEXT: .LBB6_11: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #2, .LBB6_3 +; NO_SVE-NEXT: .LBB6_12: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB6_4 +; NO_SVE-NEXT: .LBB6_13: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB6_5 +; NO_SVE-NEXT: .LBB6_14: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB6_6 +; NO_SVE-NEXT: .LBB6_15: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB6_7 +; NO_SVE-NEXT: .LBB6_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB6_8 +; NO_SVE-NEXT: .LBB6_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB6_9 +; NO_SVE-NEXT: .LBB6_18: +; NO_SVE-NEXT: mov v3.16b, v2.16b +; NO_SVE-NEXT: mov v4.16b, v2.16b +; NO_SVE-NEXT: mov v5.16b, v2.16b +; NO_SVE-NEXT: mov v6.16b, v2.16b +; NO_SVE-NEXT: mov v7.16b, v2.16b +; NO_SVE-NEXT: mov v16.16b, v2.16b +; NO_SVE-NEXT: mov v17.16b, v2.16b +; NO_SVE-NEXT: mov v18.16b, v2.16b +; NO_SVE-NEXT: mov v19.16b, v2.16b +; NO_SVE-NEXT: mov v20.16b, v2.16b +; NO_SVE-NEXT: mov v21.16b, v2.16b +; NO_SVE-NEXT: mov v22.16b, v2.16b +; NO_SVE-NEXT: mov v23.16b, v2.16b +; NO_SVE-NEXT: tbz w9, #9, .LBB6_20 +; NO_SVE-NEXT: .LBB6_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x10] +; NO_SVE-NEXT: .LBB6_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB6_76 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB6_77 +; NO_SVE-NEXT: .LBB6_22: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB6_78 +; NO_SVE-NEXT: .LBB6_23: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB6_79 +; NO_SVE-NEXT: .LBB6_24: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB6_80 +; NO_SVE-NEXT: .LBB6_25: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB6_81 +; NO_SVE-NEXT: .LBB6_26: // %else44 +; NO_SVE-NEXT: tbnz w9, #16, .LBB6_82 +; NO_SVE-NEXT: .LBB6_27: // %else47 +; NO_SVE-NEXT: tbnz w9, #17, .LBB6_83 +; NO_SVE-NEXT: .LBB6_28: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB6_84 +; NO_SVE-NEXT: .LBB6_29: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB6_85 +; NO_SVE-NEXT: .LBB6_30: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB6_86 +; NO_SVE-NEXT: .LBB6_31: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB6_87 +; NO_SVE-NEXT: .LBB6_32: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB6_88 +; NO_SVE-NEXT: .LBB6_33: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB6_89 +; NO_SVE-NEXT: .LBB6_34: // %else68 +; NO_SVE-NEXT: tbnz w9, #24, .LBB6_90 +; NO_SVE-NEXT: .LBB6_35: // %else71 +; NO_SVE-NEXT: tbnz w9, #25, .LBB6_91 +; NO_SVE-NEXT: .LBB6_36: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB6_92 +; NO_SVE-NEXT: .LBB6_37: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB6_93 +; NO_SVE-NEXT: 
.LBB6_38: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB6_94 +; NO_SVE-NEXT: .LBB6_39: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB6_95 +; NO_SVE-NEXT: .LBB6_40: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB6_96 +; NO_SVE-NEXT: .LBB6_41: // %else89 +; NO_SVE-NEXT: tbnz w9, #31, .LBB6_97 +; NO_SVE-NEXT: .LBB6_42: // %else92 +; NO_SVE-NEXT: tbnz x9, #32, .LBB6_98 +; NO_SVE-NEXT: .LBB6_43: // %else95 +; NO_SVE-NEXT: tbnz x9, #33, .LBB6_99 +; NO_SVE-NEXT: .LBB6_44: // %else98 +; NO_SVE-NEXT: tbnz x9, #34, .LBB6_100 +; NO_SVE-NEXT: .LBB6_45: // %else101 +; NO_SVE-NEXT: tbnz x9, #35, .LBB6_101 +; NO_SVE-NEXT: .LBB6_46: // %else104 +; NO_SVE-NEXT: tbnz x9, #36, .LBB6_102 +; NO_SVE-NEXT: .LBB6_47: // %else107 +; NO_SVE-NEXT: tbnz x9, #37, .LBB6_103 +; NO_SVE-NEXT: .LBB6_48: // %else110 +; NO_SVE-NEXT: tbnz x9, #38, .LBB6_104 +; NO_SVE-NEXT: .LBB6_49: // %else113 +; NO_SVE-NEXT: tbnz x9, #39, .LBB6_105 +; NO_SVE-NEXT: .LBB6_50: // %else116 +; NO_SVE-NEXT: tbnz x9, #40, .LBB6_106 +; NO_SVE-NEXT: .LBB6_51: // %else119 +; NO_SVE-NEXT: tbnz x9, #41, .LBB6_107 +; NO_SVE-NEXT: .LBB6_52: // %else122 +; NO_SVE-NEXT: tbnz x9, #42, .LBB6_108 +; NO_SVE-NEXT: .LBB6_53: // %else125 +; NO_SVE-NEXT: tbnz x9, #43, .LBB6_109 +; NO_SVE-NEXT: .LBB6_54: // %else128 +; NO_SVE-NEXT: tbnz x9, #44, .LBB6_110 +; NO_SVE-NEXT: .LBB6_55: // %else131 +; NO_SVE-NEXT: tbnz x9, #45, .LBB6_111 +; NO_SVE-NEXT: .LBB6_56: // %else134 +; NO_SVE-NEXT: tbnz x9, #46, .LBB6_112 +; NO_SVE-NEXT: .LBB6_57: // %else137 +; NO_SVE-NEXT: tbnz x9, #47, .LBB6_113 +; NO_SVE-NEXT: .LBB6_58: // %else140 +; NO_SVE-NEXT: tbnz x9, #48, .LBB6_114 +; NO_SVE-NEXT: .LBB6_59: // %else143 +; NO_SVE-NEXT: tbnz x9, #49, .LBB6_115 +; NO_SVE-NEXT: .LBB6_60: // %else146 +; NO_SVE-NEXT: tbnz x9, #50, .LBB6_116 +; NO_SVE-NEXT: .LBB6_61: // %else149 +; NO_SVE-NEXT: tbnz x9, #51, .LBB6_117 +; NO_SVE-NEXT: .LBB6_62: // %else152 +; NO_SVE-NEXT: tbnz x9, #52, .LBB6_118 +; NO_SVE-NEXT: .LBB6_63: // %else155 +; NO_SVE-NEXT: tbnz x9, #53, .LBB6_119 +; NO_SVE-NEXT: .LBB6_64: // %else158 +; NO_SVE-NEXT: tbnz x9, #54, .LBB6_120 +; NO_SVE-NEXT: .LBB6_65: // %else161 +; NO_SVE-NEXT: tbnz x9, #55, .LBB6_121 +; NO_SVE-NEXT: .LBB6_66: // %else164 +; NO_SVE-NEXT: tbnz x9, #56, .LBB6_122 +; NO_SVE-NEXT: .LBB6_67: // %else167 +; NO_SVE-NEXT: tbnz x9, #57, .LBB6_123 +; NO_SVE-NEXT: .LBB6_68: // %else170 +; NO_SVE-NEXT: tbnz x9, #58, .LBB6_124 +; NO_SVE-NEXT: .LBB6_69: // %else173 +; NO_SVE-NEXT: tbnz x9, #59, .LBB6_125 +; NO_SVE-NEXT: .LBB6_70: // %else176 +; NO_SVE-NEXT: tbnz x9, #60, .LBB6_126 +; NO_SVE-NEXT: .LBB6_71: // %else179 +; NO_SVE-NEXT: tbnz x9, #61, .LBB6_127 +; NO_SVE-NEXT: .LBB6_72: // %else182 +; NO_SVE-NEXT: tbnz x9, #62, .LBB6_128 +; NO_SVE-NEXT: .LBB6_73: // %else185 +; NO_SVE-NEXT: tbz x9, #63, .LBB6_75 +; NO_SVE-NEXT: .LBB6_74: // %cond.load187 +; NO_SVE-NEXT: add x9, x0, #252 +; NO_SVE-NEXT: ld1 { v23.s }[3], [x9] +; NO_SVE-NEXT: .LBB6_75: // %else188 +; NO_SVE-NEXT: stp q0, q1, [x8] +; NO_SVE-NEXT: stp q2, q3, [x8, #32] +; NO_SVE-NEXT: stp q4, q5, [x8, #64] +; NO_SVE-NEXT: stp q6, q7, [x8, #96] +; NO_SVE-NEXT: stp q16, q17, [x8, #128] +; NO_SVE-NEXT: stp q18, q19, [x8, #160] +; NO_SVE-NEXT: stp q20, q21, [x8, #192] +; NO_SVE-NEXT: stp q22, q23, [x8, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB6_76: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB6_22 +; NO_SVE-NEXT: .LBB6_77: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.s }[3], 
[x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB6_23 +; NO_SVE-NEXT: .LBB6_78: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB6_24 +; NO_SVE-NEXT: .LBB6_79: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB6_25 +; NO_SVE-NEXT: .LBB6_80: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB6_26 +; NO_SVE-NEXT: .LBB6_81: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.s }[3], [x10] +; NO_SVE-NEXT: tbz w9, #16, .LBB6_27 +; NO_SVE-NEXT: .LBB6_82: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x10] +; NO_SVE-NEXT: tbz w9, #17, .LBB6_28 +; NO_SVE-NEXT: .LBB6_83: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #18, .LBB6_29 +; NO_SVE-NEXT: .LBB6_84: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB6_30 +; NO_SVE-NEXT: .LBB6_85: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.s }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB6_31 +; NO_SVE-NEXT: .LBB6_86: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB6_32 +; NO_SVE-NEXT: .LBB6_87: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB6_33 +; NO_SVE-NEXT: .LBB6_88: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB6_34 +; NO_SVE-NEXT: .LBB6_89: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.s }[3], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB6_35 +; NO_SVE-NEXT: .LBB6_90: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB6_36 +; NO_SVE-NEXT: .LBB6_91: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB6_37 +; NO_SVE-NEXT: .LBB6_92: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB6_38 +; NO_SVE-NEXT: .LBB6_93: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #108 +; NO_SVE-NEXT: ld1 { v6.s }[3], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB6_39 +; NO_SVE-NEXT: .LBB6_94: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB6_40 +; NO_SVE-NEXT: .LBB6_95: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.s }[1], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB6_41 +; NO_SVE-NEXT: .LBB6_96: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB6_42 +; NO_SVE-NEXT: .LBB6_97: // %cond.load91 +; NO_SVE-NEXT: add x10, x0, #124 +; NO_SVE-NEXT: ld1 { v7.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #32, .LBB6_43 +; NO_SVE-NEXT: .LBB6_98: // %cond.load94 +; NO_SVE-NEXT: add x10, x0, #128 +; NO_SVE-NEXT: ld1 { v16.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #33, .LBB6_44 +; NO_SVE-NEXT: .LBB6_99: // %cond.load97 +; NO_SVE-NEXT: add x10, x0, #132 +; NO_SVE-NEXT: ld1 { v16.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #34, .LBB6_45 +; NO_SVE-NEXT: .LBB6_100: // %cond.load100 +; NO_SVE-NEXT: add x10, x0, #136 +; NO_SVE-NEXT: ld1 { v16.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB6_46 
+; NO_SVE-NEXT: .LBB6_101: // %cond.load103 +; NO_SVE-NEXT: add x10, x0, #140 +; NO_SVE-NEXT: ld1 { v16.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB6_47 +; NO_SVE-NEXT: .LBB6_102: // %cond.load106 +; NO_SVE-NEXT: add x10, x0, #144 +; NO_SVE-NEXT: ld1 { v17.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB6_48 +; NO_SVE-NEXT: .LBB6_103: // %cond.load109 +; NO_SVE-NEXT: add x10, x0, #148 +; NO_SVE-NEXT: ld1 { v17.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB6_49 +; NO_SVE-NEXT: .LBB6_104: // %cond.load112 +; NO_SVE-NEXT: add x10, x0, #152 +; NO_SVE-NEXT: ld1 { v17.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB6_50 +; NO_SVE-NEXT: .LBB6_105: // %cond.load115 +; NO_SVE-NEXT: add x10, x0, #156 +; NO_SVE-NEXT: ld1 { v17.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #40, .LBB6_51 +; NO_SVE-NEXT: .LBB6_106: // %cond.load118 +; NO_SVE-NEXT: add x10, x0, #160 +; NO_SVE-NEXT: ld1 { v18.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #41, .LBB6_52 +; NO_SVE-NEXT: .LBB6_107: // %cond.load121 +; NO_SVE-NEXT: add x10, x0, #164 +; NO_SVE-NEXT: ld1 { v18.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #42, .LBB6_53 +; NO_SVE-NEXT: .LBB6_108: // %cond.load124 +; NO_SVE-NEXT: add x10, x0, #168 +; NO_SVE-NEXT: ld1 { v18.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #43, .LBB6_54 +; NO_SVE-NEXT: .LBB6_109: // %cond.load127 +; NO_SVE-NEXT: add x10, x0, #172 +; NO_SVE-NEXT: ld1 { v18.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #44, .LBB6_55 +; NO_SVE-NEXT: .LBB6_110: // %cond.load130 +; NO_SVE-NEXT: add x10, x0, #176 +; NO_SVE-NEXT: ld1 { v19.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #45, .LBB6_56 +; NO_SVE-NEXT: .LBB6_111: // %cond.load133 +; NO_SVE-NEXT: add x10, x0, #180 +; NO_SVE-NEXT: ld1 { v19.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB6_57 +; NO_SVE-NEXT: .LBB6_112: // %cond.load136 +; NO_SVE-NEXT: add x10, x0, #184 +; NO_SVE-NEXT: ld1 { v19.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB6_58 +; NO_SVE-NEXT: .LBB6_113: // %cond.load139 +; NO_SVE-NEXT: add x10, x0, #188 +; NO_SVE-NEXT: ld1 { v19.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #48, .LBB6_59 +; NO_SVE-NEXT: .LBB6_114: // %cond.load142 +; NO_SVE-NEXT: add x10, x0, #192 +; NO_SVE-NEXT: ld1 { v20.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #49, .LBB6_60 +; NO_SVE-NEXT: .LBB6_115: // %cond.load145 +; NO_SVE-NEXT: add x10, x0, #196 +; NO_SVE-NEXT: ld1 { v20.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #50, .LBB6_61 +; NO_SVE-NEXT: .LBB6_116: // %cond.load148 +; NO_SVE-NEXT: add x10, x0, #200 +; NO_SVE-NEXT: ld1 { v20.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #51, .LBB6_62 +; NO_SVE-NEXT: .LBB6_117: // %cond.load151 +; NO_SVE-NEXT: add x10, x0, #204 +; NO_SVE-NEXT: ld1 { v20.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB6_63 +; NO_SVE-NEXT: .LBB6_118: // %cond.load154 +; NO_SVE-NEXT: add x10, x0, #208 +; NO_SVE-NEXT: ld1 { v21.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB6_64 +; NO_SVE-NEXT: .LBB6_119: // %cond.load157 +; NO_SVE-NEXT: add x10, x0, #212 +; NO_SVE-NEXT: ld1 { v21.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB6_65 +; NO_SVE-NEXT: .LBB6_120: // %cond.load160 +; NO_SVE-NEXT: add x10, x0, #216 +; NO_SVE-NEXT: ld1 { v21.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB6_66 +; NO_SVE-NEXT: .LBB6_121: // %cond.load163 +; NO_SVE-NEXT: add x10, x0, #220 +; NO_SVE-NEXT: ld1 { v21.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #56, .LBB6_67 +; NO_SVE-NEXT: .LBB6_122: // %cond.load166 +; NO_SVE-NEXT: add x10, x0, #224 +; NO_SVE-NEXT: ld1 { v22.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #57, .LBB6_68 +; NO_SVE-NEXT: .LBB6_123: // %cond.load169 +; NO_SVE-NEXT: add x10, x0, #228 +; NO_SVE-NEXT: ld1 { v22.s }[1], [x10] +; 
NO_SVE-NEXT: tbz x9, #58, .LBB6_69 +; NO_SVE-NEXT: .LBB6_124: // %cond.load172 +; NO_SVE-NEXT: add x10, x0, #232 +; NO_SVE-NEXT: ld1 { v22.s }[2], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB6_70 +; NO_SVE-NEXT: .LBB6_125: // %cond.load175 +; NO_SVE-NEXT: add x10, x0, #236 +; NO_SVE-NEXT: ld1 { v22.s }[3], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB6_71 +; NO_SVE-NEXT: .LBB6_126: // %cond.load178 +; NO_SVE-NEXT: add x10, x0, #240 +; NO_SVE-NEXT: ld1 { v23.s }[0], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB6_72 +; NO_SVE-NEXT: .LBB6_127: // %cond.load181 +; NO_SVE-NEXT: add x10, x0, #244 +; NO_SVE-NEXT: ld1 { v23.s }[1], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB6_73 +; NO_SVE-NEXT: .LBB6_128: // %cond.load184 +; NO_SVE-NEXT: add x10, x0, #248 +; NO_SVE-NEXT: ld1 { v23.s }[2], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB6_74 +; NO_SVE-NEXT: b .LBB6_75 +; ; VBITS_GE_2048-LABEL: masked_load_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -159,6 +1518,596 @@ } define <64 x i8> @masked_load_v64i8(<64 x i8>* %ap, <64 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v64i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: ldp q5, q4, [x1, #32] +; NO_SVE-NEXT: cmeq v4.16b, v3.16b, v4.16b +; NO_SVE-NEXT: cmeq v3.16b, v2.16b, v5.16b +; NO_SVE-NEXT: umov w8, v4.b[1] +; NO_SVE-NEXT: umov w10, v4.b[2] +; NO_SVE-NEXT: umov w9, v4.b[0] +; NO_SVE-NEXT: umov w11, v4.b[3] +; NO_SVE-NEXT: umov w12, v4.b[4] +; NO_SVE-NEXT: umov w13, v4.b[5] +; NO_SVE-NEXT: umov w14, v4.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w15, v4.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w16, v4.b[8] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w17, v4.b[9] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w8, v4.b[10] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w10, v4.b[11] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v4.b[12] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: umov w12, v4.b[13] +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: umov w13, v4.b[14] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #8 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #9 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #10 +; NO_SVE-NEXT: umov w9, v3.b[1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #11 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #12 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w12, v3.b[0] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #13 +; NO_SVE-NEXT: umov w10, v3.b[2] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v3.b[3] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[4] +; NO_SVE-NEXT: umov w15, v3.b[5] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[15] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: bfi w12, w9, #1, #1 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w16, v3.b[14] +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[6] +; NO_SVE-NEXT: bfi w12, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v3.b[7] +; NO_SVE-NEXT: bfi w12, w9, #3, #1 +; 
NO_SVE-NEXT: and w9, w15, #0x1 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: bfi w12, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v3.b[8] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[9] +; NO_SVE-NEXT: bfi w12, w9, #5, #1 +; NO_SVE-NEXT: and w9, w10, #0x1 +; NO_SVE-NEXT: orr w10, w12, w13, lsl #6 +; NO_SVE-NEXT: umov w12, v3.b[10] +; NO_SVE-NEXT: orr w9, w10, w9, lsl #7 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: ldp q2, q5, [x1] +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[11] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v3.b[12] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #9 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w14, lsl #15 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: cmeq v1.16b, v1.16b, v5.16b +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w13, v3.b[13] +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: umov w15, v1.b[0] +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, v2.16b +; NO_SVE-NEXT: umov w17, v1.b[13] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #13 +; NO_SVE-NEXT: bfi w14, w10, #1, #1 +; NO_SVE-NEXT: umov w13, v1.b[6] +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: bfi w14, w11, #2, #1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[9] +; NO_SVE-NEXT: bfi w14, w10, #3, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: bfi w14, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v1.b[8] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #14 +; NO_SVE-NEXT: umov w15, v0.b[4] +; NO_SVE-NEXT: bfi w14, w10, #5, #1 +; NO_SVE-NEXT: orr w10, w14, w13, lsl #6 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[10] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w12, lsl #7 +; NO_SVE-NEXT: and w12, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[11] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #9 +; NO_SVE-NEXT: umov w14, v3.b[15] +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[12] +; NO_SVE-NEXT: umov w16, v0.b[5] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[1] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #11 +; NO_SVE-NEXT: umov w11, v0.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[3] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #16, #16 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #1, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: bfi w11, w14, #2, #1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v0.b[7] +; NO_SVE-NEXT: bfi w11, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v1.b[14] +; NO_SVE-NEXT: bfi w11, w14, #5, #1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[8] +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: and w12, w12, 
#0x1 +; NO_SVE-NEXT: umov w17, v0.b[9] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #6 +; NO_SVE-NEXT: umov w14, v0.b[10] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #13 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[11] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v0.b[12] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #14 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[13] +; NO_SVE-NEXT: orr w11, w11, w15, lsl #9 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[14] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #10 +; NO_SVE-NEXT: umov w13, v1.b[15] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w11, w15, lsl #11 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #12 +; NO_SVE-NEXT: and w12, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w13, lsl #15 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #13 +; NO_SVE-NEXT: orr w8, w11, w12, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w15, lsl #15 +; NO_SVE-NEXT: bfi w8, w10, #16, #16 +; NO_SVE-NEXT: bfi x8, x9, #32, #32 +; NO_SVE-NEXT: tbz w8, #0, .LBB7_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB7_3 +; NO_SVE-NEXT: b .LBB7_4 +; NO_SVE-NEXT: .LBB7_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB7_4 +; NO_SVE-NEXT: .LBB7_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB7_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB7_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB7_21 +; NO_SVE-NEXT: .LBB7_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB7_22 +; NO_SVE-NEXT: .LBB7_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB7_23 +; NO_SVE-NEXT: .LBB7_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB7_24 +; NO_SVE-NEXT: .LBB7_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB7_25 +; NO_SVE-NEXT: .LBB7_10: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB7_26 +; NO_SVE-NEXT: .LBB7_11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB7_27 +; NO_SVE-NEXT: .LBB7_12: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB7_28 +; NO_SVE-NEXT: .LBB7_13: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB7_29 +; NO_SVE-NEXT: .LBB7_14: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB7_30 +; NO_SVE-NEXT: .LBB7_15: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB7_31 +; NO_SVE-NEXT: .LBB7_16: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB7_32 +; NO_SVE-NEXT: .LBB7_17: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB7_33 +; NO_SVE-NEXT: .LBB7_18: // %else44 +; NO_SVE-NEXT: tbz w8, #16, .LBB7_34 +; NO_SVE-NEXT: .LBB7_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB7_35 +; NO_SVE-NEXT: b .LBB7_36 +; NO_SVE-NEXT: .LBB7_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB7_6 +; NO_SVE-NEXT: .LBB7_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB7_7 +; NO_SVE-NEXT: .LBB7_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB7_8 +; NO_SVE-NEXT: .LBB7_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB7_9 +; NO_SVE-NEXT: .LBB7_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b 
}[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB7_10 +; NO_SVE-NEXT: .LBB7_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB7_11 +; NO_SVE-NEXT: .LBB7_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB7_12 +; NO_SVE-NEXT: .LBB7_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB7_13 +; NO_SVE-NEXT: .LBB7_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB7_14 +; NO_SVE-NEXT: .LBB7_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB7_15 +; NO_SVE-NEXT: .LBB7_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB7_16 +; NO_SVE-NEXT: .LBB7_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB7_17 +; NO_SVE-NEXT: .LBB7_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB7_18 +; NO_SVE-NEXT: .LBB7_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w8, #16, .LBB7_19 +; NO_SVE-NEXT: .LBB7_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #17, .LBB7_36 +; NO_SVE-NEXT: .LBB7_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x9] +; NO_SVE-NEXT: .LBB7_36: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB7_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB7_53 +; NO_SVE-NEXT: .LBB7_38: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB7_54 +; NO_SVE-NEXT: .LBB7_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB7_55 +; NO_SVE-NEXT: .LBB7_40: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB7_56 +; NO_SVE-NEXT: .LBB7_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB7_57 +; NO_SVE-NEXT: .LBB7_42: // %else68 +; NO_SVE-NEXT: tbnz w8, #24, .LBB7_58 +; NO_SVE-NEXT: .LBB7_43: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB7_59 +; NO_SVE-NEXT: .LBB7_44: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB7_60 +; NO_SVE-NEXT: .LBB7_45: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB7_61 +; NO_SVE-NEXT: .LBB7_46: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB7_62 +; NO_SVE-NEXT: .LBB7_47: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB7_63 +; NO_SVE-NEXT: .LBB7_48: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB7_64 +; NO_SVE-NEXT: .LBB7_49: // %else89 +; NO_SVE-NEXT: tbnz w8, #31, .LBB7_65 +; NO_SVE-NEXT: .LBB7_50: // %else92 +; NO_SVE-NEXT: tbz x8, #32, .LBB7_66 +; NO_SVE-NEXT: .LBB7_51: // %cond.load94 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz x8, #33, .LBB7_67 +; NO_SVE-NEXT: b .LBB7_68 +; NO_SVE-NEXT: .LBB7_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB7_38 +; NO_SVE-NEXT: .LBB7_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB7_39 +; NO_SVE-NEXT: .LBB7_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB7_40 +; NO_SVE-NEXT: .LBB7_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB7_41 +; NO_SVE-NEXT: 
.LBB7_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB7_42 +; NO_SVE-NEXT: .LBB7_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB7_43 +; NO_SVE-NEXT: .LBB7_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB7_44 +; NO_SVE-NEXT: .LBB7_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB7_45 +; NO_SVE-NEXT: .LBB7_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB7_46 +; NO_SVE-NEXT: .LBB7_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB7_47 +; NO_SVE-NEXT: .LBB7_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB7_48 +; NO_SVE-NEXT: .LBB7_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB7_49 +; NO_SVE-NEXT: .LBB7_64: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #31, .LBB7_50 +; NO_SVE-NEXT: .LBB7_65: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x9] +; NO_SVE-NEXT: tbnz x8, #32, .LBB7_51 +; NO_SVE-NEXT: .LBB7_66: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz x8, #33, .LBB7_68 +; NO_SVE-NEXT: .LBB7_67: // %cond.load97 +; NO_SVE-NEXT: add x9, x0, #33 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB7_68: // %else98 +; NO_SVE-NEXT: tbnz x8, #34, .LBB7_84 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x8, #35, .LBB7_85 +; NO_SVE-NEXT: .LBB7_70: // %else104 +; NO_SVE-NEXT: tbnz x8, #36, .LBB7_86 +; NO_SVE-NEXT: .LBB7_71: // %else107 +; NO_SVE-NEXT: tbnz x8, #37, .LBB7_87 +; NO_SVE-NEXT: .LBB7_72: // %else110 +; NO_SVE-NEXT: tbnz x8, #38, .LBB7_88 +; NO_SVE-NEXT: .LBB7_73: // %else113 +; NO_SVE-NEXT: tbnz x8, #39, .LBB7_89 +; NO_SVE-NEXT: .LBB7_74: // %else116 +; NO_SVE-NEXT: tbnz x8, #40, .LBB7_90 +; NO_SVE-NEXT: .LBB7_75: // %else119 +; NO_SVE-NEXT: tbnz x8, #41, .LBB7_91 +; NO_SVE-NEXT: .LBB7_76: // %else122 +; NO_SVE-NEXT: tbnz x8, #42, .LBB7_92 +; NO_SVE-NEXT: .LBB7_77: // %else125 +; NO_SVE-NEXT: tbnz x8, #43, .LBB7_93 +; NO_SVE-NEXT: .LBB7_78: // %else128 +; NO_SVE-NEXT: tbnz x8, #44, .LBB7_94 +; NO_SVE-NEXT: .LBB7_79: // %else131 +; NO_SVE-NEXT: tbnz x8, #45, .LBB7_95 +; NO_SVE-NEXT: .LBB7_80: // %else134 +; NO_SVE-NEXT: tbnz x8, #46, .LBB7_96 +; NO_SVE-NEXT: .LBB7_81: // %else137 +; NO_SVE-NEXT: tbnz x8, #47, .LBB7_97 +; NO_SVE-NEXT: .LBB7_82: // %else140 +; NO_SVE-NEXT: tbz x8, #48, .LBB7_98 +; NO_SVE-NEXT: .LBB7_83: // %cond.load142 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.b }[0], [x9] +; NO_SVE-NEXT: tbnz x8, #49, .LBB7_99 +; NO_SVE-NEXT: b .LBB7_100 +; NO_SVE-NEXT: .LBB7_84: // %cond.load100 +; NO_SVE-NEXT: add x9, x0, #34 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz x8, #35, .LBB7_70 +; NO_SVE-NEXT: .LBB7_85: // %cond.load103 +; NO_SVE-NEXT: add x9, x0, #35 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz x8, #36, .LBB7_71 +; NO_SVE-NEXT: .LBB7_86: // %cond.load106 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz x8, #37, .LBB7_72 +; NO_SVE-NEXT: .LBB7_87: // %cond.load109 +; NO_SVE-NEXT: add 
x9, x0, #37 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz x8, #38, .LBB7_73 +; NO_SVE-NEXT: .LBB7_88: // %cond.load112 +; NO_SVE-NEXT: add x9, x0, #38 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz x8, #39, .LBB7_74 +; NO_SVE-NEXT: .LBB7_89: // %cond.load115 +; NO_SVE-NEXT: add x9, x0, #39 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz x8, #40, .LBB7_75 +; NO_SVE-NEXT: .LBB7_90: // %cond.load118 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: tbz x8, #41, .LBB7_76 +; NO_SVE-NEXT: .LBB7_91: // %cond.load121 +; NO_SVE-NEXT: add x9, x0, #41 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: tbz x8, #42, .LBB7_77 +; NO_SVE-NEXT: .LBB7_92: // %cond.load124 +; NO_SVE-NEXT: add x9, x0, #42 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: tbz x8, #43, .LBB7_78 +; NO_SVE-NEXT: .LBB7_93: // %cond.load127 +; NO_SVE-NEXT: add x9, x0, #43 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x9] +; NO_SVE-NEXT: tbz x8, #44, .LBB7_79 +; NO_SVE-NEXT: .LBB7_94: // %cond.load130 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x9] +; NO_SVE-NEXT: tbz x8, #45, .LBB7_80 +; NO_SVE-NEXT: .LBB7_95: // %cond.load133 +; NO_SVE-NEXT: add x9, x0, #45 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x9] +; NO_SVE-NEXT: tbz x8, #46, .LBB7_81 +; NO_SVE-NEXT: .LBB7_96: // %cond.load136 +; NO_SVE-NEXT: add x9, x0, #46 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x9] +; NO_SVE-NEXT: tbz x8, #47, .LBB7_82 +; NO_SVE-NEXT: .LBB7_97: // %cond.load139 +; NO_SVE-NEXT: add x9, x0, #47 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x9] +; NO_SVE-NEXT: tbnz x8, #48, .LBB7_83 +; NO_SVE-NEXT: .LBB7_98: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz x8, #49, .LBB7_100 +; NO_SVE-NEXT: .LBB7_99: // %cond.load145 +; NO_SVE-NEXT: add x9, x0, #49 +; NO_SVE-NEXT: ld1 { v3.b }[1], [x9] +; NO_SVE-NEXT: .LBB7_100: // %else146 +; NO_SVE-NEXT: tbnz x8, #50, .LBB7_115 +; NO_SVE-NEXT: // %bb.101: // %else149 +; NO_SVE-NEXT: tbnz x8, #51, .LBB7_116 +; NO_SVE-NEXT: .LBB7_102: // %else152 +; NO_SVE-NEXT: tbnz x8, #52, .LBB7_117 +; NO_SVE-NEXT: .LBB7_103: // %else155 +; NO_SVE-NEXT: tbnz x8, #53, .LBB7_118 +; NO_SVE-NEXT: .LBB7_104: // %else158 +; NO_SVE-NEXT: tbnz x8, #54, .LBB7_119 +; NO_SVE-NEXT: .LBB7_105: // %else161 +; NO_SVE-NEXT: tbnz x8, #55, .LBB7_120 +; NO_SVE-NEXT: .LBB7_106: // %else164 +; NO_SVE-NEXT: tbnz x8, #56, .LBB7_121 +; NO_SVE-NEXT: .LBB7_107: // %else167 +; NO_SVE-NEXT: tbnz x8, #57, .LBB7_122 +; NO_SVE-NEXT: .LBB7_108: // %else170 +; NO_SVE-NEXT: tbnz x8, #58, .LBB7_123 +; NO_SVE-NEXT: .LBB7_109: // %else173 +; NO_SVE-NEXT: tbnz x8, #59, .LBB7_124 +; NO_SVE-NEXT: .LBB7_110: // %else176 +; NO_SVE-NEXT: tbnz x8, #60, .LBB7_125 +; NO_SVE-NEXT: .LBB7_111: // %else179 +; NO_SVE-NEXT: tbnz x8, #61, .LBB7_126 +; NO_SVE-NEXT: .LBB7_112: // %else182 +; NO_SVE-NEXT: tbnz x8, #62, .LBB7_127 +; NO_SVE-NEXT: .LBB7_113: // %else185 +; NO_SVE-NEXT: tbnz x8, #63, .LBB7_128 +; NO_SVE-NEXT: .LBB7_114: // %else188 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB7_115: // %cond.load148 +; NO_SVE-NEXT: add x9, x0, #50 +; NO_SVE-NEXT: ld1 { v3.b }[2], [x9] +; NO_SVE-NEXT: tbz x8, #51, .LBB7_102 +; NO_SVE-NEXT: .LBB7_116: // %cond.load151 +; NO_SVE-NEXT: add x9, x0, #51 +; NO_SVE-NEXT: ld1 { v3.b }[3], [x9] +; NO_SVE-NEXT: tbz x8, #52, .LBB7_103 +; NO_SVE-NEXT: .LBB7_117: // %cond.load154 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: ld1 { v3.b }[4], [x9] +; NO_SVE-NEXT: tbz x8, #53, .LBB7_104 +; NO_SVE-NEXT: .LBB7_118: // %cond.load157 +; NO_SVE-NEXT: add x9, x0, #53 
+; NO_SVE-NEXT: ld1 { v3.b }[5], [x9] +; NO_SVE-NEXT: tbz x8, #54, .LBB7_105 +; NO_SVE-NEXT: .LBB7_119: // %cond.load160 +; NO_SVE-NEXT: add x9, x0, #54 +; NO_SVE-NEXT: ld1 { v3.b }[6], [x9] +; NO_SVE-NEXT: tbz x8, #55, .LBB7_106 +; NO_SVE-NEXT: .LBB7_120: // %cond.load163 +; NO_SVE-NEXT: add x9, x0, #55 +; NO_SVE-NEXT: ld1 { v3.b }[7], [x9] +; NO_SVE-NEXT: tbz x8, #56, .LBB7_107 +; NO_SVE-NEXT: .LBB7_121: // %cond.load166 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: ld1 { v3.b }[8], [x9] +; NO_SVE-NEXT: tbz x8, #57, .LBB7_108 +; NO_SVE-NEXT: .LBB7_122: // %cond.load169 +; NO_SVE-NEXT: add x9, x0, #57 +; NO_SVE-NEXT: ld1 { v3.b }[9], [x9] +; NO_SVE-NEXT: tbz x8, #58, .LBB7_109 +; NO_SVE-NEXT: .LBB7_123: // %cond.load172 +; NO_SVE-NEXT: add x9, x0, #58 +; NO_SVE-NEXT: ld1 { v3.b }[10], [x9] +; NO_SVE-NEXT: tbz x8, #59, .LBB7_110 +; NO_SVE-NEXT: .LBB7_124: // %cond.load175 +; NO_SVE-NEXT: add x9, x0, #59 +; NO_SVE-NEXT: ld1 { v3.b }[11], [x9] +; NO_SVE-NEXT: tbz x8, #60, .LBB7_111 +; NO_SVE-NEXT: .LBB7_125: // %cond.load178 +; NO_SVE-NEXT: add x9, x0, #60 +; NO_SVE-NEXT: ld1 { v3.b }[12], [x9] +; NO_SVE-NEXT: tbz x8, #61, .LBB7_112 +; NO_SVE-NEXT: .LBB7_126: // %cond.load181 +; NO_SVE-NEXT: add x9, x0, #61 +; NO_SVE-NEXT: ld1 { v3.b }[13], [x9] +; NO_SVE-NEXT: tbz x8, #62, .LBB7_113 +; NO_SVE-NEXT: .LBB7_127: // %cond.load184 +; NO_SVE-NEXT: add x9, x0, #62 +; NO_SVE-NEXT: ld1 { v3.b }[14], [x9] +; NO_SVE-NEXT: tbz x8, #63, .LBB7_114 +; NO_SVE-NEXT: .LBB7_128: // %cond.load187 +; NO_SVE-NEXT: add x8, x0, #63 +; NO_SVE-NEXT: ld1 { v3.b }[15], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_v64i8: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 @@ -176,6 +2125,314 @@ } define <32 x i16> @masked_load_v32i16(<32 x i16>* %ap, <32 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v32i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q2, [x0, #32] +; NO_SVE-NEXT: ldp q1, q3, [x1, #32] +; NO_SVE-NEXT: cmeq v1.8h, v0.8h, v1.8h +; NO_SVE-NEXT: xtn v5.8b, v1.8h +; NO_SVE-NEXT: cmeq v1.8h, v2.8h, v3.8h +; NO_SVE-NEXT: umov w8, v5.b[1] +; NO_SVE-NEXT: umov w9, v5.b[2] +; NO_SVE-NEXT: umov w10, v5.b[0] +; NO_SVE-NEXT: umov w11, v5.b[3] +; NO_SVE-NEXT: umov w12, v5.b[4] +; NO_SVE-NEXT: umov w13, v5.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[6] +; NO_SVE-NEXT: ldp q4, q0, [x0] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v5.b[7] +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: ldp q3, q2, [x1] +; NO_SVE-NEXT: bfi w10, w11, #3, #1 +; NO_SVE-NEXT: umov w9, v1.b[2] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w12, #4, #1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: cmeq v3.8h, v4.8h, v3.8h +; NO_SVE-NEXT: bfi w10, w13, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #6 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #7 +; NO_SVE-NEXT: umov w13, v3.b[1] +; NO_SVE-NEXT: umov w14, v3.b[2] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #8 +; NO_SVE-NEXT: and w11, w11, 
#0x1 +; NO_SVE-NEXT: orr w8, w10, w8, lsl #9 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #10 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[3] +; NO_SVE-NEXT: umov w15, v3.b[4] +; NO_SVE-NEXT: umov w16, v3.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[6] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #2, #1 +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, v2.8h +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v3.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #6 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB8_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB8_3 +; NO_SVE-NEXT: b .LBB8_4 +; NO_SVE-NEXT: .LBB8_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB8_4 +; NO_SVE-NEXT: .LBB8_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB8_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB8_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB8_13 +; NO_SVE-NEXT: .LBB8_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB8_14 +; NO_SVE-NEXT: .LBB8_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB8_15 +; NO_SVE-NEXT: .LBB8_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB8_16 +; NO_SVE-NEXT: .LBB8_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB8_17 +; NO_SVE-NEXT: .LBB8_10: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB8_18 +; NO_SVE-NEXT: .LBB8_11: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB8_19 +; NO_SVE-NEXT: b .LBB8_20 +; NO_SVE-NEXT: .LBB8_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB8_6 +; NO_SVE-NEXT: .LBB8_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB8_7 
+; NO_SVE-NEXT: .LBB8_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB8_8 +; NO_SVE-NEXT: .LBB8_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB8_9 +; NO_SVE-NEXT: .LBB8_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB8_10 +; NO_SVE-NEXT: .LBB8_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB8_11 +; NO_SVE-NEXT: .LBB8_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #9, .LBB8_20 +; NO_SVE-NEXT: .LBB8_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v1.h }[1], [x9] +; NO_SVE-NEXT: .LBB8_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB8_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB8_29 +; NO_SVE-NEXT: .LBB8_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB8_30 +; NO_SVE-NEXT: .LBB8_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB8_31 +; NO_SVE-NEXT: .LBB8_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB8_32 +; NO_SVE-NEXT: .LBB8_25: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB8_33 +; NO_SVE-NEXT: .LBB8_26: // %else44 +; NO_SVE-NEXT: tbz w8, #16, .LBB8_34 +; NO_SVE-NEXT: .LBB8_27: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB8_35 +; NO_SVE-NEXT: b .LBB8_36 +; NO_SVE-NEXT: .LBB8_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB8_22 +; NO_SVE-NEXT: .LBB8_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v1.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB8_23 +; NO_SVE-NEXT: .LBB8_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB8_24 +; NO_SVE-NEXT: .LBB8_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v1.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB8_25 +; NO_SVE-NEXT: .LBB8_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB8_26 +; NO_SVE-NEXT: .LBB8_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v1.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #16, .LBB8_27 +; NO_SVE-NEXT: .LBB8_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB8_36 +; NO_SVE-NEXT: .LBB8_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #34 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB8_36: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB8_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB8_45 +; NO_SVE-NEXT: .LBB8_38: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB8_46 +; NO_SVE-NEXT: .LBB8_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB8_47 +; NO_SVE-NEXT: .LBB8_40: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB8_48 +; NO_SVE-NEXT: .LBB8_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB8_49 +; NO_SVE-NEXT: .LBB8_42: // %else68 +; NO_SVE-NEXT: tbz w8, #24, .LBB8_50 +; NO_SVE-NEXT: .LBB8_43: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #25, .LBB8_51 +; NO_SVE-NEXT: b .LBB8_52 +; NO_SVE-NEXT: .LBB8_44: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB8_38 +; NO_SVE-NEXT: .LBB8_45: // %cond.load55 +; NO_SVE-NEXT: add 
x9, x0, #38 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB8_39 +; NO_SVE-NEXT: .LBB8_46: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB8_40 +; NO_SVE-NEXT: .LBB8_47: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #42 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB8_41 +; NO_SVE-NEXT: .LBB8_48: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB8_42 +; NO_SVE-NEXT: .LBB8_49: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #46 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #24, .LBB8_43 +; NO_SVE-NEXT: .LBB8_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #25, .LBB8_52 +; NO_SVE-NEXT: .LBB8_51: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #50 +; NO_SVE-NEXT: ld1 { v3.h }[1], [x9] +; NO_SVE-NEXT: .LBB8_52: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB8_59 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB8_60 +; NO_SVE-NEXT: .LBB8_54: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB8_61 +; NO_SVE-NEXT: .LBB8_55: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB8_62 +; NO_SVE-NEXT: .LBB8_56: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB8_63 +; NO_SVE-NEXT: .LBB8_57: // %else89 +; NO_SVE-NEXT: tbnz w8, #31, .LBB8_64 +; NO_SVE-NEXT: .LBB8_58: // %else92 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB8_59: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB8_54 +; NO_SVE-NEXT: .LBB8_60: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #54 +; NO_SVE-NEXT: ld1 { v3.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB8_55 +; NO_SVE-NEXT: .LBB8_61: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB8_56 +; NO_SVE-NEXT: .LBB8_62: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #58 +; NO_SVE-NEXT: ld1 { v3.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB8_57 +; NO_SVE-NEXT: .LBB8_63: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #60 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #31, .LBB8_58 +; NO_SVE-NEXT: .LBB8_64: // %cond.load91 +; NO_SVE-NEXT: add x8, x0, #62 +; NO_SVE-NEXT: ld1 { v3.h }[7], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_v32i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -193,6 +2450,172 @@ } define <16 x i32> @masked_load_v16i32(<16 x i32>* %ap, <16 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ldp q3, q2, [x1] +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, v3.4s +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, v2.4s +; NO_SVE-NEXT: ldp q3, q2, [x0, #32] +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: ldp q4, q1, [x1, #32] +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v2.4s, v1.4s +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: cmeq v2.4s, v3.4s, v4.4s +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; NO_SVE-NEXT: and w12, 
w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB9_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB9_3 +; NO_SVE-NEXT: b .LBB9_4 +; NO_SVE-NEXT: .LBB9_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB9_4 +; NO_SVE-NEXT: .LBB9_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB9_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB9_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB9_9 +; NO_SVE-NEXT: .LBB9_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB9_10 +; NO_SVE-NEXT: .LBB9_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB9_11 +; NO_SVE-NEXT: b .LBB9_12 +; NO_SVE-NEXT: .LBB9_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB9_6 +; NO_SVE-NEXT: .LBB9_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB9_7 +; NO_SVE-NEXT: .LBB9_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #5, .LBB9_12 +; NO_SVE-NEXT: .LBB9_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x9] +; NO_SVE-NEXT: .LBB9_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB9_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB9_17 +; NO_SVE-NEXT: .LBB9_14: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB9_18 +; NO_SVE-NEXT: .LBB9_15: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB9_19 +; NO_SVE-NEXT: b .LBB9_20 +; NO_SVE-NEXT: .LBB9_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB9_14 +; NO_SVE-NEXT: .LBB9_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB9_15 +; NO_SVE-NEXT: .LBB9_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB9_20 +; NO_SVE-NEXT: .LBB9_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB9_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB9_24 +; NO_SVE-NEXT: 
// %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB9_25 +; NO_SVE-NEXT: .LBB9_22: // %else32 +; NO_SVE-NEXT: tbz w8, #12, .LBB9_26 +; NO_SVE-NEXT: .LBB9_23: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #13, .LBB9_27 +; NO_SVE-NEXT: b .LBB9_28 +; NO_SVE-NEXT: .LBB9_24: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB9_22 +; NO_SVE-NEXT: .LBB9_25: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #12, .LBB9_23 +; NO_SVE-NEXT: .LBB9_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #13, .LBB9_28 +; NO_SVE-NEXT: .LBB9_27: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: ld1 { v3.s }[1], [x9] +; NO_SVE-NEXT: .LBB9_28: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB9_31 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB9_32 +; NO_SVE-NEXT: .LBB9_30: // %else44 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB9_31: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB9_30 +; NO_SVE-NEXT: .LBB9_32: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #60 +; NO_SVE-NEXT: ld1 { v3.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_v16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -210,6 +2633,99 @@ } define <8 x i64> @masked_load_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_v8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: ldp q2, q3, [x0, #32] +; NO_SVE-NEXT: ldp q4, q5, [x1, #32] +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, v4.2d +; NO_SVE-NEXT: ldp q6, q7, [x1] +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, v5.2d +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, v6.2d +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, v7.2d +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB10_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB10_3 +; NO_SVE-NEXT: b .LBB10_4 +; NO_SVE-NEXT: .LBB10_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB10_4 +; NO_SVE-NEXT: .LBB10_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: .LBB10_4: // %else2 +; NO_SVE-NEXT: tbz w8, #2, .LBB10_6 +; NO_SVE-NEXT: // %bb.5: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; 
NO_SVE-NEXT: tbnz w8, #3, .LBB10_7 +; NO_SVE-NEXT: b .LBB10_8 +; NO_SVE-NEXT: .LBB10_6: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w8, #3, .LBB10_8 +; NO_SVE-NEXT: .LBB10_7: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: .LBB10_8: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB10_10 +; NO_SVE-NEXT: // %bb.9: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB10_11 +; NO_SVE-NEXT: b .LBB10_12 +; NO_SVE-NEXT: .LBB10_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB10_12 +; NO_SVE-NEXT: .LBB10_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: .LBB10_12: // %else14 +; NO_SVE-NEXT: tbz w8, #6, .LBB10_14 +; NO_SVE-NEXT: // %bb.13: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB10_15 +; NO_SVE-NEXT: b .LBB10_16 +; NO_SVE-NEXT: .LBB10_14: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w8, #7, .LBB10_16 +; NO_SVE-NEXT: .LBB10_15: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #56 +; NO_SVE-NEXT: ld1 { v3.d }[1], [x8] +; NO_SVE-NEXT: .LBB10_16: // %else20 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -227,6 +2743,96 @@ } define <8 x i64> @masked_load_passthru_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_passthru_v8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: ldp q6, q7, [x0, #32] +; NO_SVE-NEXT: ldp q2, q3, [x1, #32] +; NO_SVE-NEXT: cmeq v6.2d, v6.2d, v2.2d +; NO_SVE-NEXT: ldp q0, q1, [x1] +; NO_SVE-NEXT: cmeq v7.2d, v7.2d, v3.2d +; NO_SVE-NEXT: uzp1 v6.4s, v6.4s, v7.4s +; NO_SVE-NEXT: cmeq v4.2d, v4.2d, v0.2d +; NO_SVE-NEXT: cmeq v5.2d, v5.2d, v1.2d +; NO_SVE-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; NO_SVE-NEXT: uzp1 v4.8h, v4.8h, v6.8h +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: umov w8, v4.b[1] +; NO_SVE-NEXT: umov w10, v4.b[2] +; NO_SVE-NEXT: umov w9, v4.b[0] +; NO_SVE-NEXT: umov w11, v4.b[3] +; NO_SVE-NEXT: umov w12, v4.b[4] +; NO_SVE-NEXT: umov w13, v4.b[5] +; NO_SVE-NEXT: umov w14, v4.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v4.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB11_9 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB11_10 +; NO_SVE-NEXT: .LBB11_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB11_11 +; NO_SVE-NEXT: .LBB11_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB11_12 +; NO_SVE-NEXT: .LBB11_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB11_13 +; NO_SVE-NEXT: .LBB11_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB11_14 +; NO_SVE-NEXT: .LBB11_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB11_15 +; NO_SVE-NEXT: .LBB11_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB11_16 +; NO_SVE-NEXT: .LBB11_8: // %else20 +; NO_SVE-NEXT: add sp, sp, #16 +; 
NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB11_9: // %cond.load +; NO_SVE-NEXT: ld1 { v0.d }[0], [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB11_2 +; NO_SVE-NEXT: .LBB11_10: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB11_3 +; NO_SVE-NEXT: .LBB11_11: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB11_4 +; NO_SVE-NEXT: .LBB11_12: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB11_5 +; NO_SVE-NEXT: .LBB11_13: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB11_6 +; NO_SVE-NEXT: .LBB11_14: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB11_7 +; NO_SVE-NEXT: .LBB11_15: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB11_8 +; NO_SVE-NEXT: .LBB11_16: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #56 +; NO_SVE-NEXT: ld1 { v3.d }[1], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_passthru_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -245,6 +2851,96 @@ } define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>* %bp) #0 { +; NO_SVE-LABEL: masked_load_passthru_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q4, q5, [x0] +; NO_SVE-NEXT: ldp q6, q7, [x0, #32] +; NO_SVE-NEXT: ldp q2, q3, [x1, #32] +; NO_SVE-NEXT: fcmeq v6.2d, v6.2d, v2.2d +; NO_SVE-NEXT: ldp q0, q1, [x1] +; NO_SVE-NEXT: fcmeq v7.2d, v7.2d, v3.2d +; NO_SVE-NEXT: fcmeq v4.2d, v4.2d, v0.2d +; NO_SVE-NEXT: uzp1 v6.4s, v6.4s, v7.4s +; NO_SVE-NEXT: fcmeq v5.2d, v5.2d, v1.2d +; NO_SVE-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; NO_SVE-NEXT: uzp1 v4.8h, v4.8h, v6.8h +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: umov w8, v4.b[1] +; NO_SVE-NEXT: umov w10, v4.b[2] +; NO_SVE-NEXT: umov w9, v4.b[0] +; NO_SVE-NEXT: umov w11, v4.b[3] +; NO_SVE-NEXT: umov w12, v4.b[4] +; NO_SVE-NEXT: umov w13, v4.b[5] +; NO_SVE-NEXT: umov w14, v4.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v4.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB12_9 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB12_10 +; NO_SVE-NEXT: .LBB12_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB12_11 +; NO_SVE-NEXT: .LBB12_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB12_12 +; NO_SVE-NEXT: .LBB12_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB12_13 +; NO_SVE-NEXT: .LBB12_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB12_14 +; NO_SVE-NEXT: .LBB12_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB12_15 +; NO_SVE-NEXT: .LBB12_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB12_16 +; NO_SVE-NEXT: .LBB12_8: // %else20 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB12_9: // %cond.load +; NO_SVE-NEXT: ld1 { v0.d }[0], 
[x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB12_2 +; NO_SVE-NEXT: .LBB12_10: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.d }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB12_3 +; NO_SVE-NEXT: .LBB12_11: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.d }[0], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB12_4 +; NO_SVE-NEXT: .LBB12_12: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.d }[1], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB12_5 +; NO_SVE-NEXT: .LBB12_13: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.d }[0], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB12_6 +; NO_SVE-NEXT: .LBB12_14: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.d }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB12_7 +; NO_SVE-NEXT: .LBB12_15: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #48 +; NO_SVE-NEXT: ld1 { v3.d }[0], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB12_8 +; NO_SVE-NEXT: .LBB12_16: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #56 +; NO_SVE-NEXT: ld1 { v3.d }[1], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_load_passthru_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -263,6 +2959,306 @@ } define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v32i8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.16b, v1.16b, #0 +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[8] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w17, v0.b[9] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[0] +; NO_SVE-NEXT: umov w8, v0.b[10] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[11] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: umov w14, v1.b[2] +; NO_SVE-NEXT: bfi w12, w10, #1, #1 +; NO_SVE-NEXT: and w10, w16, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[12] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #9 +; NO_SVE-NEXT: umov w15, v1.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #10 +; NO_SVE-NEXT: umov w9, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: bfi w12, w14, #2, #1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w12, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[7] +; NO_SVE-NEXT: umov w11, v0.b[13] +; 
NO_SVE-NEXT: bfi w12, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v0.b[14] +; NO_SVE-NEXT: bfi w12, w9, #5, #1 +; NO_SVE-NEXT: and w9, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[8] +; NO_SVE-NEXT: umov w15, v1.b[9] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w12, w9, lsl #6 +; NO_SVE-NEXT: umov w12, v1.b[10] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #13 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[11] +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[12] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[13] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v0.b[15] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[15] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB13_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB13_3 +; NO_SVE-NEXT: b .LBB13_4 +; NO_SVE-NEXT: .LBB13_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB13_4 +; NO_SVE-NEXT: .LBB13_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB13_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB13_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB13_21 +; NO_SVE-NEXT: .LBB13_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB13_22 +; NO_SVE-NEXT: .LBB13_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB13_23 +; NO_SVE-NEXT: .LBB13_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB13_24 +; NO_SVE-NEXT: .LBB13_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB13_25 +; NO_SVE-NEXT: .LBB13_10: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB13_26 +; NO_SVE-NEXT: .LBB13_11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB13_27 +; NO_SVE-NEXT: .LBB13_12: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB13_28 +; NO_SVE-NEXT: .LBB13_13: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB13_29 +; NO_SVE-NEXT: .LBB13_14: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB13_30 +; NO_SVE-NEXT: .LBB13_15: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB13_31 +; NO_SVE-NEXT: .LBB13_16: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB13_32 +; NO_SVE-NEXT: .LBB13_17: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB13_33 +; NO_SVE-NEXT: .LBB13_18: // %else44 +; NO_SVE-NEXT: tbz w8, #16, .LBB13_34 +; NO_SVE-NEXT: .LBB13_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB13_35 +; NO_SVE-NEXT: b .LBB13_36 +; NO_SVE-NEXT: .LBB13_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB13_6 +; NO_SVE-NEXT: .LBB13_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB13_7 +; NO_SVE-NEXT: .LBB13_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB13_8 +; NO_SVE-NEXT: 
.LBB13_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB13_9 +; NO_SVE-NEXT: .LBB13_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB13_10 +; NO_SVE-NEXT: .LBB13_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB13_11 +; NO_SVE-NEXT: .LBB13_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB13_12 +; NO_SVE-NEXT: .LBB13_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB13_13 +; NO_SVE-NEXT: .LBB13_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB13_14 +; NO_SVE-NEXT: .LBB13_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB13_15 +; NO_SVE-NEXT: .LBB13_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB13_16 +; NO_SVE-NEXT: .LBB13_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB13_17 +; NO_SVE-NEXT: .LBB13_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB13_18 +; NO_SVE-NEXT: .LBB13_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w8, #16, .LBB13_19 +; NO_SVE-NEXT: .LBB13_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB13_36 +; NO_SVE-NEXT: .LBB13_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB13_36: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB13_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB13_53 +; NO_SVE-NEXT: .LBB13_38: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB13_54 +; NO_SVE-NEXT: .LBB13_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB13_55 +; NO_SVE-NEXT: .LBB13_40: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB13_56 +; NO_SVE-NEXT: .LBB13_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB13_57 +; NO_SVE-NEXT: .LBB13_42: // %else68 +; NO_SVE-NEXT: tbnz w8, #24, .LBB13_58 +; NO_SVE-NEXT: .LBB13_43: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB13_59 +; NO_SVE-NEXT: .LBB13_44: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB13_60 +; NO_SVE-NEXT: .LBB13_45: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB13_61 +; NO_SVE-NEXT: .LBB13_46: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB13_62 +; NO_SVE-NEXT: .LBB13_47: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB13_63 +; NO_SVE-NEXT: .LBB13_48: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB13_64 +; NO_SVE-NEXT: .LBB13_49: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB13_51 +; NO_SVE-NEXT: .LBB13_50: // %cond.load91 +; NO_SVE-NEXT: add x8, x0, #31 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x8] +; NO_SVE-NEXT: .LBB13_51: // %else92 +; NO_SVE-NEXT: sshll2 v1.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v3.8h, v2.16b, #0 +; NO_SVE-NEXT: sshll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB13_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB13_38 +; NO_SVE-NEXT: .LBB13_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; 
NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB13_39 +; NO_SVE-NEXT: .LBB13_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB13_40 +; NO_SVE-NEXT: .LBB13_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB13_41 +; NO_SVE-NEXT: .LBB13_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB13_42 +; NO_SVE-NEXT: .LBB13_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB13_43 +; NO_SVE-NEXT: .LBB13_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB13_44 +; NO_SVE-NEXT: .LBB13_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB13_45 +; NO_SVE-NEXT: .LBB13_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB13_46 +; NO_SVE-NEXT: .LBB13_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB13_47 +; NO_SVE-NEXT: .LBB13_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB13_48 +; NO_SVE-NEXT: .LBB13_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB13_49 +; NO_SVE-NEXT: .LBB13_64: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB13_50 +; NO_SVE-NEXT: b .LBB13_51 +; ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -279,6 +3275,164 @@ } define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v16i8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[8] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v0.b[9] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v0.b[10] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v0.b[11] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v0.b[12] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v0.b[13] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v0.b[14] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; 
NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[15] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB14_18 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB14_19 +; NO_SVE-NEXT: .LBB14_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB14_20 +; NO_SVE-NEXT: .LBB14_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB14_21 +; NO_SVE-NEXT: .LBB14_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB14_22 +; NO_SVE-NEXT: .LBB14_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB14_23 +; NO_SVE-NEXT: .LBB14_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB14_24 +; NO_SVE-NEXT: .LBB14_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB14_25 +; NO_SVE-NEXT: .LBB14_8: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB14_26 +; NO_SVE-NEXT: .LBB14_9: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB14_27 +; NO_SVE-NEXT: .LBB14_10: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB14_28 +; NO_SVE-NEXT: .LBB14_11: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB14_29 +; NO_SVE-NEXT: .LBB14_12: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB14_30 +; NO_SVE-NEXT: .LBB14_13: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB14_31 +; NO_SVE-NEXT: .LBB14_14: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB14_32 +; NO_SVE-NEXT: .LBB14_15: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB14_17 +; NO_SVE-NEXT: .LBB14_16: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x8] +; NO_SVE-NEXT: .LBB14_17: // %else44 +; NO_SVE-NEXT: sshll2 v2.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB14_18: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB14_2 +; NO_SVE-NEXT: .LBB14_19: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB14_3 +; NO_SVE-NEXT: .LBB14_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB14_4 +; NO_SVE-NEXT: .LBB14_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB14_5 +; NO_SVE-NEXT: .LBB14_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB14_6 +; NO_SVE-NEXT: .LBB14_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB14_7 +; NO_SVE-NEXT: .LBB14_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB14_8 +; NO_SVE-NEXT: .LBB14_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB14_9 +; NO_SVE-NEXT: .LBB14_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB14_10 +; NO_SVE-NEXT: .LBB14_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB14_11 +; NO_SVE-NEXT: .LBB14_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b 
}[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB14_12 +; NO_SVE-NEXT: .LBB14_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB14_13 +; NO_SVE-NEXT: .LBB14_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB14_14 +; NO_SVE-NEXT: .LBB14_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB14_15 +; NO_SVE-NEXT: .LBB14_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB14_16 +; NO_SVE-NEXT: b .LBB14_17 +; ; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -295,6 +3449,93 @@ } define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v8i8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x1] +; NO_SVE-NEXT: cmeq v0.8b, v0.8b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB15_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB15_11 +; NO_SVE-NEXT: .LBB15_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB15_12 +; NO_SVE-NEXT: .LBB15_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB15_13 +; NO_SVE-NEXT: .LBB15_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB15_14 +; NO_SVE-NEXT: .LBB15_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB15_15 +; NO_SVE-NEXT: .LBB15_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB15_16 +; NO_SVE-NEXT: .LBB15_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB15_9 +; NO_SVE-NEXT: .LBB15_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x8] +; NO_SVE-NEXT: .LBB15_9: // %else20 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB15_10: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB15_2 +; NO_SVE-NEXT: .LBB15_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB15_3 +; NO_SVE-NEXT: .LBB15_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB15_4 +; NO_SVE-NEXT: .LBB15_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB15_5 +; 
NO_SVE-NEXT: .LBB15_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB15_6 +; NO_SVE-NEXT: .LBB15_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB15_7 +; NO_SVE-NEXT: .LBB15_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB15_8 +; NO_SVE-NEXT: b .LBB15_9 +; ; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -311,6 +3552,166 @@ } define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v16i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB16_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB16_3 +; NO_SVE-NEXT: b .LBB16_4 +; NO_SVE-NEXT: .LBB16_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB16_4 +; NO_SVE-NEXT: .LBB16_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB16_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB16_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB16_13 +; NO_SVE-NEXT: .LBB16_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB16_14 +; NO_SVE-NEXT: .LBB16_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB16_15 +; NO_SVE-NEXT: .LBB16_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB16_16 +; NO_SVE-NEXT: .LBB16_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB16_17 +; NO_SVE-NEXT: .LBB16_10: // %else20 
+; NO_SVE-NEXT: tbz w8, #8, .LBB16_18 +; NO_SVE-NEXT: .LBB16_11: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB16_19 +; NO_SVE-NEXT: b .LBB16_20 +; NO_SVE-NEXT: .LBB16_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB16_6 +; NO_SVE-NEXT: .LBB16_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB16_7 +; NO_SVE-NEXT: .LBB16_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB16_8 +; NO_SVE-NEXT: .LBB16_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB16_9 +; NO_SVE-NEXT: .LBB16_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB16_10 +; NO_SVE-NEXT: .LBB16_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB16_11 +; NO_SVE-NEXT: .LBB16_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB16_20 +; NO_SVE-NEXT: .LBB16_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB16_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB16_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB16_29 +; NO_SVE-NEXT: .LBB16_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB16_30 +; NO_SVE-NEXT: .LBB16_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB16_31 +; NO_SVE-NEXT: .LBB16_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB16_32 +; NO_SVE-NEXT: .LBB16_25: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB16_27 +; NO_SVE-NEXT: .LBB16_26: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #30 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x8] +; NO_SVE-NEXT: .LBB16_27: // %else44 +; NO_SVE-NEXT: sshll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB16_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB16_22 +; NO_SVE-NEXT: .LBB16_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB16_23 +; NO_SVE-NEXT: .LBB16_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB16_24 +; NO_SVE-NEXT: .LBB16_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB16_25 +; NO_SVE-NEXT: .LBB16_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB16_26 +; NO_SVE-NEXT: b .LBB16_27 +; ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -327,6 +3728,93 @@ } define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v8i16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; 
NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB17_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB17_11 +; NO_SVE-NEXT: .LBB17_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB17_12 +; NO_SVE-NEXT: .LBB17_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB17_13 +; NO_SVE-NEXT: .LBB17_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB17_14 +; NO_SVE-NEXT: .LBB17_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB17_15 +; NO_SVE-NEXT: .LBB17_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB17_16 +; NO_SVE-NEXT: .LBB17_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB17_9 +; NO_SVE-NEXT: .LBB17_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x8] +; NO_SVE-NEXT: .LBB17_9: // %else20 +; NO_SVE-NEXT: sshll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB17_10: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB17_2 +; NO_SVE-NEXT: .LBB17_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB17_3 +; NO_SVE-NEXT: .LBB17_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB17_4 +; NO_SVE-NEXT: .LBB17_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB17_5 +; NO_SVE-NEXT: .LBB17_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB17_6 +; NO_SVE-NEXT: .LBB17_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB17_7 +; NO_SVE-NEXT: .LBB17_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB17_8 +; NO_SVE-NEXT: b .LBB17_9 +; ; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -343,6 +3831,94 @@ } define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v8i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: 
and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB18_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB18_3 +; NO_SVE-NEXT: b .LBB18_4 +; NO_SVE-NEXT: .LBB18_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB18_4 +; NO_SVE-NEXT: .LBB18_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB18_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB18_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB18_9 +; NO_SVE-NEXT: .LBB18_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB18_10 +; NO_SVE-NEXT: .LBB18_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB18_11 +; NO_SVE-NEXT: b .LBB18_12 +; NO_SVE-NEXT: .LBB18_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB18_6 +; NO_SVE-NEXT: .LBB18_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB18_7 +; NO_SVE-NEXT: .LBB18_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB18_12 +; NO_SVE-NEXT: .LBB18_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB18_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB18_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB18_15 +; NO_SVE-NEXT: .LBB18_14: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x8] +; NO_SVE-NEXT: .LBB18_15: // %else20 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB18_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB18_14 +; NO_SVE-NEXT: b .LBB18_15 +; ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -359,6 +3935,306 @@ } define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v32i8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.16b, v1.16b, #0 +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[8] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 
+; NO_SVE-NEXT: umov w10, v1.b[1] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w17, v0.b[9] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[0] +; NO_SVE-NEXT: umov w8, v0.b[10] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[11] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: umov w14, v1.b[2] +; NO_SVE-NEXT: bfi w12, w10, #1, #1 +; NO_SVE-NEXT: and w10, w16, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[12] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #9 +; NO_SVE-NEXT: umov w15, v1.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #10 +; NO_SVE-NEXT: umov w9, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #12 +; NO_SVE-NEXT: bfi w12, w14, #2, #1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w12, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[7] +; NO_SVE-NEXT: umov w11, v0.b[13] +; NO_SVE-NEXT: bfi w12, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v0.b[14] +; NO_SVE-NEXT: bfi w12, w9, #5, #1 +; NO_SVE-NEXT: and w9, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[8] +; NO_SVE-NEXT: umov w15, v1.b[9] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w12, w9, lsl #6 +; NO_SVE-NEXT: umov w12, v1.b[10] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #13 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[11] +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[12] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[13] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v0.b[15] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[15] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB19_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB19_3 +; NO_SVE-NEXT: b .LBB19_4 +; NO_SVE-NEXT: .LBB19_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB19_4 +; NO_SVE-NEXT: .LBB19_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB19_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB19_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB19_21 +; NO_SVE-NEXT: .LBB19_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB19_22 +; NO_SVE-NEXT: .LBB19_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB19_23 +; NO_SVE-NEXT: .LBB19_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB19_24 +; NO_SVE-NEXT: .LBB19_9: 
// %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB19_25 +; NO_SVE-NEXT: .LBB19_10: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB19_26 +; NO_SVE-NEXT: .LBB19_11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB19_27 +; NO_SVE-NEXT: .LBB19_12: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB19_28 +; NO_SVE-NEXT: .LBB19_13: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB19_29 +; NO_SVE-NEXT: .LBB19_14: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB19_30 +; NO_SVE-NEXT: .LBB19_15: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB19_31 +; NO_SVE-NEXT: .LBB19_16: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB19_32 +; NO_SVE-NEXT: .LBB19_17: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB19_33 +; NO_SVE-NEXT: .LBB19_18: // %else44 +; NO_SVE-NEXT: tbz w8, #16, .LBB19_34 +; NO_SVE-NEXT: .LBB19_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB19_35 +; NO_SVE-NEXT: b .LBB19_36 +; NO_SVE-NEXT: .LBB19_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB19_6 +; NO_SVE-NEXT: .LBB19_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB19_7 +; NO_SVE-NEXT: .LBB19_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB19_8 +; NO_SVE-NEXT: .LBB19_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB19_9 +; NO_SVE-NEXT: .LBB19_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB19_10 +; NO_SVE-NEXT: .LBB19_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB19_11 +; NO_SVE-NEXT: .LBB19_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB19_12 +; NO_SVE-NEXT: .LBB19_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB19_13 +; NO_SVE-NEXT: .LBB19_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB19_14 +; NO_SVE-NEXT: .LBB19_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB19_15 +; NO_SVE-NEXT: .LBB19_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB19_16 +; NO_SVE-NEXT: .LBB19_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB19_17 +; NO_SVE-NEXT: .LBB19_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB19_18 +; NO_SVE-NEXT: .LBB19_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w8, #16, .LBB19_19 +; NO_SVE-NEXT: .LBB19_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB19_36 +; NO_SVE-NEXT: .LBB19_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB19_36: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB19_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB19_53 +; NO_SVE-NEXT: .LBB19_38: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB19_54 +; NO_SVE-NEXT: .LBB19_39: // %else59 +; NO_SVE-NEXT: tbnz 
w8, #21, .LBB19_55 +; NO_SVE-NEXT: .LBB19_40: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB19_56 +; NO_SVE-NEXT: .LBB19_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB19_57 +; NO_SVE-NEXT: .LBB19_42: // %else68 +; NO_SVE-NEXT: tbnz w8, #24, .LBB19_58 +; NO_SVE-NEXT: .LBB19_43: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB19_59 +; NO_SVE-NEXT: .LBB19_44: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB19_60 +; NO_SVE-NEXT: .LBB19_45: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB19_61 +; NO_SVE-NEXT: .LBB19_46: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB19_62 +; NO_SVE-NEXT: .LBB19_47: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB19_63 +; NO_SVE-NEXT: .LBB19_48: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB19_64 +; NO_SVE-NEXT: .LBB19_49: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB19_51 +; NO_SVE-NEXT: .LBB19_50: // %cond.load91 +; NO_SVE-NEXT: add x8, x0, #31 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x8] +; NO_SVE-NEXT: .LBB19_51: // %else92 +; NO_SVE-NEXT: ushll2 v1.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v3.8h, v2.16b, #0 +; NO_SVE-NEXT: ushll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB19_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB19_38 +; NO_SVE-NEXT: .LBB19_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB19_39 +; NO_SVE-NEXT: .LBB19_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB19_40 +; NO_SVE-NEXT: .LBB19_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB19_41 +; NO_SVE-NEXT: .LBB19_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB19_42 +; NO_SVE-NEXT: .LBB19_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB19_43 +; NO_SVE-NEXT: .LBB19_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB19_44 +; NO_SVE-NEXT: .LBB19_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB19_45 +; NO_SVE-NEXT: .LBB19_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB19_46 +; NO_SVE-NEXT: .LBB19_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB19_47 +; NO_SVE-NEXT: .LBB19_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB19_48 +; NO_SVE-NEXT: .LBB19_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB19_49 +; NO_SVE-NEXT: .LBB19_64: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB19_50 +; NO_SVE-NEXT: b .LBB19_51 +; ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -375,6 +4251,164 @@ } define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v16i8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, 
[x1] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[8] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v0.b[9] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v0.b[10] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v0.b[11] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v0.b[12] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v0.b[13] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v0.b[14] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v0.b[15] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB20_18 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB20_19 +; NO_SVE-NEXT: .LBB20_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB20_20 +; NO_SVE-NEXT: .LBB20_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB20_21 +; NO_SVE-NEXT: .LBB20_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB20_22 +; NO_SVE-NEXT: .LBB20_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB20_23 +; NO_SVE-NEXT: .LBB20_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB20_24 +; NO_SVE-NEXT: .LBB20_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB20_25 +; NO_SVE-NEXT: .LBB20_8: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB20_26 +; NO_SVE-NEXT: .LBB20_9: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB20_27 +; NO_SVE-NEXT: .LBB20_10: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB20_28 +; NO_SVE-NEXT: .LBB20_11: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB20_29 +; NO_SVE-NEXT: .LBB20_12: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB20_30 +; NO_SVE-NEXT: .LBB20_13: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB20_31 +; NO_SVE-NEXT: .LBB20_14: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB20_32 +; NO_SVE-NEXT: .LBB20_15: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB20_17 +; NO_SVE-NEXT: .LBB20_16: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x8] +; NO_SVE-NEXT: .LBB20_17: // %else44 +; NO_SVE-NEXT: ushll2 v2.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB20_18: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB20_2 +; NO_SVE-NEXT: .LBB20_19: // %cond.load1 +; NO_SVE-NEXT: 
add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB20_3 +; NO_SVE-NEXT: .LBB20_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB20_4 +; NO_SVE-NEXT: .LBB20_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB20_5 +; NO_SVE-NEXT: .LBB20_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB20_6 +; NO_SVE-NEXT: .LBB20_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB20_7 +; NO_SVE-NEXT: .LBB20_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB20_8 +; NO_SVE-NEXT: .LBB20_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB20_9 +; NO_SVE-NEXT: .LBB20_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB20_10 +; NO_SVE-NEXT: .LBB20_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB20_11 +; NO_SVE-NEXT: .LBB20_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB20_12 +; NO_SVE-NEXT: .LBB20_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB20_13 +; NO_SVE-NEXT: .LBB20_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB20_14 +; NO_SVE-NEXT: .LBB20_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB20_15 +; NO_SVE-NEXT: .LBB20_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB20_16 +; NO_SVE-NEXT: b .LBB20_17 +; ; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -391,6 +4425,93 @@ } define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v8i8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x1] +; NO_SVE-NEXT: cmeq v0.8b, v0.8b, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB21_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB21_11 +; NO_SVE-NEXT: .LBB21_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB21_12 +; NO_SVE-NEXT: .LBB21_3: // 
%else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB21_13 +; NO_SVE-NEXT: .LBB21_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB21_14 +; NO_SVE-NEXT: .LBB21_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB21_15 +; NO_SVE-NEXT: .LBB21_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB21_16 +; NO_SVE-NEXT: .LBB21_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB21_9 +; NO_SVE-NEXT: .LBB21_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x8] +; NO_SVE-NEXT: .LBB21_9: // %else20 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB21_10: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB21_2 +; NO_SVE-NEXT: .LBB21_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB21_3 +; NO_SVE-NEXT: .LBB21_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB21_4 +; NO_SVE-NEXT: .LBB21_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB21_5 +; NO_SVE-NEXT: .LBB21_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB21_6 +; NO_SVE-NEXT: .LBB21_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB21_7 +; NO_SVE-NEXT: .LBB21_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB21_8 +; NO_SVE-NEXT: b .LBB21_9 +; ; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -407,6 +4528,166 @@ } define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v16i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr 
w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB22_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB22_3 +; NO_SVE-NEXT: b .LBB22_4 +; NO_SVE-NEXT: .LBB22_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB22_4 +; NO_SVE-NEXT: .LBB22_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB22_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB22_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB22_13 +; NO_SVE-NEXT: .LBB22_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB22_14 +; NO_SVE-NEXT: .LBB22_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB22_15 +; NO_SVE-NEXT: .LBB22_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB22_16 +; NO_SVE-NEXT: .LBB22_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB22_17 +; NO_SVE-NEXT: .LBB22_10: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB22_18 +; NO_SVE-NEXT: .LBB22_11: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB22_19 +; NO_SVE-NEXT: b .LBB22_20 +; NO_SVE-NEXT: .LBB22_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB22_6 +; NO_SVE-NEXT: .LBB22_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB22_7 +; NO_SVE-NEXT: .LBB22_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB22_8 +; NO_SVE-NEXT: .LBB22_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB22_9 +; NO_SVE-NEXT: .LBB22_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB22_10 +; NO_SVE-NEXT: .LBB22_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB22_11 +; NO_SVE-NEXT: .LBB22_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB22_20 +; NO_SVE-NEXT: .LBB22_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB22_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB22_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB22_29 +; NO_SVE-NEXT: .LBB22_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB22_30 +; NO_SVE-NEXT: .LBB22_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB22_31 +; NO_SVE-NEXT: .LBB22_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB22_32 +; NO_SVE-NEXT: .LBB22_25: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB22_27 +; NO_SVE-NEXT: .LBB22_26: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #30 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x8] +; NO_SVE-NEXT: .LBB22_27: // %else44 +; NO_SVE-NEXT: ushll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: 
.LBB22_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB22_22 +; NO_SVE-NEXT: .LBB22_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB22_23 +; NO_SVE-NEXT: .LBB22_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB22_24 +; NO_SVE-NEXT: .LBB22_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB22_25 +; NO_SVE-NEXT: .LBB22_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB22_26 +; NO_SVE-NEXT: b .LBB22_27 +; ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -423,6 +4704,93 @@ } define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v8i16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x1] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB23_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB23_11 +; NO_SVE-NEXT: .LBB23_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB23_12 +; NO_SVE-NEXT: .LBB23_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB23_13 +; NO_SVE-NEXT: .LBB23_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB23_14 +; NO_SVE-NEXT: .LBB23_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB23_15 +; NO_SVE-NEXT: .LBB23_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB23_16 +; NO_SVE-NEXT: .LBB23_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB23_9 +; NO_SVE-NEXT: .LBB23_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x8] +; NO_SVE-NEXT: .LBB23_9: // %else20 +; NO_SVE-NEXT: ushll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB23_10: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB23_2 +; NO_SVE-NEXT: .LBB23_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB23_3 +; NO_SVE-NEXT: .LBB23_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB23_4 +; NO_SVE-NEXT: .LBB23_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; 
NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB23_5 +; NO_SVE-NEXT: .LBB23_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB23_6 +; NO_SVE-NEXT: .LBB23_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB23_7 +; NO_SVE-NEXT: .LBB23_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB23_8 +; NO_SVE-NEXT: b .LBB23_9 +; ; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -439,6 +4807,94 @@ } define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v8i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB24_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB24_3 +; NO_SVE-NEXT: b .LBB24_4 +; NO_SVE-NEXT: .LBB24_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB24_4 +; NO_SVE-NEXT: .LBB24_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB24_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB24_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB24_9 +; NO_SVE-NEXT: .LBB24_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB24_10 +; NO_SVE-NEXT: .LBB24_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB24_11 +; NO_SVE-NEXT: b .LBB24_12 +; NO_SVE-NEXT: .LBB24_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB24_6 +; NO_SVE-NEXT: .LBB24_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB24_7 +; NO_SVE-NEXT: .LBB24_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB24_12 +; NO_SVE-NEXT: .LBB24_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB24_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB24_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB24_15 +; NO_SVE-NEXT: .LBB24_14: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x8] +; NO_SVE-NEXT: .LBB24_15: // %else20 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, 
v0.2s, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB24_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB24_14 +; NO_SVE-NEXT: b .LBB24_15 +; ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -455,6 +4911,313 @@ } define <32 x i16> @masked_load_sext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v32i8i16_m16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[1] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: umov w15, v2.b[0] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #8 +; NO_SVE-NEXT: umov w16, v2.b[2] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #9 +; NO_SVE-NEXT: umov w9, v2.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #10 +; NO_SVE-NEXT: umov w10, v2.b[4] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w11, w14, #1, #1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #12 +; NO_SVE-NEXT: cmeq v0.8h, v3.8h, #0 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: bfi w11, w16, #2, #1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: bfi w11, w9, #3, #1 +; NO_SVE-NEXT: umov w9, v2.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w11, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w11, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #13 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w11, w9, lsl #7 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] 
+; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB25_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB25_3 +; NO_SVE-NEXT: b .LBB25_4 +; NO_SVE-NEXT: .LBB25_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB25_4 +; NO_SVE-NEXT: .LBB25_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB25_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB25_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB25_21 +; NO_SVE-NEXT: .LBB25_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB25_22 +; NO_SVE-NEXT: .LBB25_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB25_23 +; NO_SVE-NEXT: .LBB25_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB25_24 +; NO_SVE-NEXT: .LBB25_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB25_25 +; NO_SVE-NEXT: .LBB25_10: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB25_26 +; NO_SVE-NEXT: .LBB25_11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB25_27 +; NO_SVE-NEXT: .LBB25_12: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB25_28 +; NO_SVE-NEXT: .LBB25_13: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB25_29 +; NO_SVE-NEXT: .LBB25_14: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB25_30 +; NO_SVE-NEXT: .LBB25_15: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB25_31 +; NO_SVE-NEXT: .LBB25_16: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB25_32 +; NO_SVE-NEXT: .LBB25_17: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB25_33 +; NO_SVE-NEXT: .LBB25_18: // %else44 +; NO_SVE-NEXT: tbz w8, #16, .LBB25_34 +; NO_SVE-NEXT: .LBB25_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB25_35 +; NO_SVE-NEXT: b .LBB25_36 +; NO_SVE-NEXT: .LBB25_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB25_6 +; NO_SVE-NEXT: .LBB25_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB25_7 +; NO_SVE-NEXT: .LBB25_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB25_8 +; NO_SVE-NEXT: .LBB25_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB25_9 +; NO_SVE-NEXT: .LBB25_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB25_10 +; NO_SVE-NEXT: .LBB25_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB25_11 +; NO_SVE-NEXT: .LBB25_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; 
NO_SVE-NEXT: tbz w8, #9, .LBB25_12 +; NO_SVE-NEXT: .LBB25_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB25_13 +; NO_SVE-NEXT: .LBB25_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB25_14 +; NO_SVE-NEXT: .LBB25_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB25_15 +; NO_SVE-NEXT: .LBB25_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB25_16 +; NO_SVE-NEXT: .LBB25_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB25_17 +; NO_SVE-NEXT: .LBB25_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB25_18 +; NO_SVE-NEXT: .LBB25_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w8, #16, .LBB25_19 +; NO_SVE-NEXT: .LBB25_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB25_36 +; NO_SVE-NEXT: .LBB25_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB25_36: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB25_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB25_53 +; NO_SVE-NEXT: .LBB25_38: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB25_54 +; NO_SVE-NEXT: .LBB25_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB25_55 +; NO_SVE-NEXT: .LBB25_40: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB25_56 +; NO_SVE-NEXT: .LBB25_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB25_57 +; NO_SVE-NEXT: .LBB25_42: // %else68 +; NO_SVE-NEXT: tbnz w8, #24, .LBB25_58 +; NO_SVE-NEXT: .LBB25_43: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB25_59 +; NO_SVE-NEXT: .LBB25_44: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB25_60 +; NO_SVE-NEXT: .LBB25_45: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB25_61 +; NO_SVE-NEXT: .LBB25_46: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB25_62 +; NO_SVE-NEXT: .LBB25_47: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB25_63 +; NO_SVE-NEXT: .LBB25_48: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB25_64 +; NO_SVE-NEXT: .LBB25_49: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB25_51 +; NO_SVE-NEXT: .LBB25_50: // %cond.load91 +; NO_SVE-NEXT: add x8, x0, #31 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x8] +; NO_SVE-NEXT: .LBB25_51: // %else92 +; NO_SVE-NEXT: sshll2 v1.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v3.8h, v2.16b, #0 +; NO_SVE-NEXT: sshll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB25_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB25_38 +; NO_SVE-NEXT: .LBB25_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB25_39 +; NO_SVE-NEXT: .LBB25_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB25_40 +; NO_SVE-NEXT: .LBB25_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB25_41 +; NO_SVE-NEXT: .LBB25_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB25_42 +; NO_SVE-NEXT: 
.LBB25_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB25_43 +; NO_SVE-NEXT: .LBB25_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB25_44 +; NO_SVE-NEXT: .LBB25_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB25_45 +; NO_SVE-NEXT: .LBB25_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB25_46 +; NO_SVE-NEXT: .LBB25_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB25_47 +; NO_SVE-NEXT: .LBB25_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB25_48 +; NO_SVE-NEXT: .LBB25_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB25_49 +; NO_SVE-NEXT: .LBB25_64: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB25_50 +; NO_SVE-NEXT: b .LBB25_51 +; ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16_m16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -471,6 +5234,172 @@ } define <16 x i32> @masked_load_sext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v16i8i32_m32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q1, [x1, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: // 
implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB26_18 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB26_19 +; NO_SVE-NEXT: .LBB26_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB26_20 +; NO_SVE-NEXT: .LBB26_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB26_21 +; NO_SVE-NEXT: .LBB26_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB26_22 +; NO_SVE-NEXT: .LBB26_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB26_23 +; NO_SVE-NEXT: .LBB26_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB26_24 +; NO_SVE-NEXT: .LBB26_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB26_25 +; NO_SVE-NEXT: .LBB26_8: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB26_26 +; NO_SVE-NEXT: .LBB26_9: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB26_27 +; NO_SVE-NEXT: .LBB26_10: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB26_28 +; NO_SVE-NEXT: .LBB26_11: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB26_29 +; NO_SVE-NEXT: .LBB26_12: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB26_30 +; NO_SVE-NEXT: .LBB26_13: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB26_31 +; NO_SVE-NEXT: .LBB26_14: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB26_32 +; NO_SVE-NEXT: .LBB26_15: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB26_17 +; NO_SVE-NEXT: .LBB26_16: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x8] +; NO_SVE-NEXT: .LBB26_17: // %else44 +; NO_SVE-NEXT: sshll2 v2.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB26_18: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB26_2 +; NO_SVE-NEXT: .LBB26_19: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB26_3 +; NO_SVE-NEXT: .LBB26_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB26_4 +; NO_SVE-NEXT: .LBB26_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB26_5 +; NO_SVE-NEXT: .LBB26_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB26_6 +; NO_SVE-NEXT: .LBB26_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB26_7 +; NO_SVE-NEXT: .LBB26_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB26_8 +; NO_SVE-NEXT: .LBB26_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB26_9 +; NO_SVE-NEXT: .LBB26_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB26_10 +; NO_SVE-NEXT: .LBB26_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB26_11 +; NO_SVE-NEXT: .LBB26_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB26_12 +; NO_SVE-NEXT: .LBB26_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB26_13 +; NO_SVE-NEXT: .LBB26_30: // 
%cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB26_14 +; NO_SVE-NEXT: .LBB26_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB26_15 +; NO_SVE-NEXT: .LBB26_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB26_16 +; NO_SVE-NEXT: b .LBB26_17 +; ; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32_m32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -487,6 +5416,101 @@ } define <8 x i64> @masked_load_sext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v8i8i64_m64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB27_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB27_11 +; NO_SVE-NEXT: .LBB27_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB27_12 +; NO_SVE-NEXT: .LBB27_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB27_13 +; NO_SVE-NEXT: .LBB27_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB27_14 +; NO_SVE-NEXT: .LBB27_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB27_15 +; NO_SVE-NEXT: .LBB27_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB27_16 +; NO_SVE-NEXT: .LBB27_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB27_9 +; NO_SVE-NEXT: .LBB27_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x8] +; NO_SVE-NEXT: .LBB27_9: // %else20 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB27_10: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB27_2 +; NO_SVE-NEXT: .LBB27_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB27_3 +; NO_SVE-NEXT: .LBB27_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB27_4 +; NO_SVE-NEXT: .LBB27_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, 
#3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB27_5 +; NO_SVE-NEXT: .LBB27_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB27_6 +; NO_SVE-NEXT: .LBB27_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB27_7 +; NO_SVE-NEXT: .LBB27_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB27_8 +; NO_SVE-NEXT: b .LBB27_9 +; ; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64_m64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -503,6 +5527,171 @@ } define <16 x i32> @masked_load_sext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v16i16i32_m32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q1, [x1, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB28_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB28_3 +; NO_SVE-NEXT: b .LBB28_4 +; NO_SVE-NEXT: .LBB28_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB28_4 +; NO_SVE-NEXT: .LBB28_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB28_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB28_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB28_13 +; NO_SVE-NEXT: .LBB28_6: // %else8 +; NO_SVE-NEXT: 
tbnz w8, #4, .LBB28_14 +; NO_SVE-NEXT: .LBB28_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB28_15 +; NO_SVE-NEXT: .LBB28_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB28_16 +; NO_SVE-NEXT: .LBB28_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB28_17 +; NO_SVE-NEXT: .LBB28_10: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB28_18 +; NO_SVE-NEXT: .LBB28_11: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB28_19 +; NO_SVE-NEXT: b .LBB28_20 +; NO_SVE-NEXT: .LBB28_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB28_6 +; NO_SVE-NEXT: .LBB28_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB28_7 +; NO_SVE-NEXT: .LBB28_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB28_8 +; NO_SVE-NEXT: .LBB28_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB28_9 +; NO_SVE-NEXT: .LBB28_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB28_10 +; NO_SVE-NEXT: .LBB28_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB28_11 +; NO_SVE-NEXT: .LBB28_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB28_20 +; NO_SVE-NEXT: .LBB28_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB28_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB28_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB28_29 +; NO_SVE-NEXT: .LBB28_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB28_30 +; NO_SVE-NEXT: .LBB28_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB28_31 +; NO_SVE-NEXT: .LBB28_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB28_32 +; NO_SVE-NEXT: .LBB28_25: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB28_27 +; NO_SVE-NEXT: .LBB28_26: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #30 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x8] +; NO_SVE-NEXT: .LBB28_27: // %else44 +; NO_SVE-NEXT: sshll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB28_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB28_22 +; NO_SVE-NEXT: .LBB28_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB28_23 +; NO_SVE-NEXT: .LBB28_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB28_24 +; NO_SVE-NEXT: .LBB28_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB28_25 +; NO_SVE-NEXT: .LBB28_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB28_26 +; NO_SVE-NEXT: b .LBB28_27 +; ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32_m32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -519,6 +5708,100 @@ } define <8 x i64> @masked_load_sext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v8i16i64_m64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: 
sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB29_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB29_11 +; NO_SVE-NEXT: .LBB29_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB29_12 +; NO_SVE-NEXT: .LBB29_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB29_13 +; NO_SVE-NEXT: .LBB29_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB29_14 +; NO_SVE-NEXT: .LBB29_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB29_15 +; NO_SVE-NEXT: .LBB29_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB29_16 +; NO_SVE-NEXT: .LBB29_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB29_9 +; NO_SVE-NEXT: .LBB29_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x8] +; NO_SVE-NEXT: .LBB29_9: // %else20 +; NO_SVE-NEXT: sshll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB29_10: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB29_2 +; NO_SVE-NEXT: .LBB29_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB29_3 +; NO_SVE-NEXT: .LBB29_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB29_4 +; NO_SVE-NEXT: .LBB29_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB29_5 +; NO_SVE-NEXT: .LBB29_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB29_6 +; NO_SVE-NEXT: .LBB29_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB29_7 +; NO_SVE-NEXT: .LBB29_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB29_8 +; NO_SVE-NEXT: b .LBB29_9 +; ; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64_m64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -535,6 +5818,99 @@ } define <8 x i64> @masked_load_sext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: 
masked_load_sext_v8i32i64_m64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB30_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB30_3 +; NO_SVE-NEXT: b .LBB30_4 +; NO_SVE-NEXT: .LBB30_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB30_4 +; NO_SVE-NEXT: .LBB30_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB30_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB30_9 +; NO_SVE-NEXT: .LBB30_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB30_10 +; NO_SVE-NEXT: .LBB30_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB30_11 +; NO_SVE-NEXT: b .LBB30_12 +; NO_SVE-NEXT: .LBB30_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB30_6 +; NO_SVE-NEXT: .LBB30_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB30_7 +; NO_SVE-NEXT: .LBB30_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB30_12 +; NO_SVE-NEXT: .LBB30_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB30_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB30_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB30_15 +; NO_SVE-NEXT: .LBB30_14: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x8] +; NO_SVE-NEXT: .LBB30_15: // %else20 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB30_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB30_14 +; NO_SVE-NEXT: b .LBB30_15 +; ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64_m64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -551,6 +5927,313 @@ } define <32 x i16> @masked_load_zext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp) #0 { +; NO_SVE-LABEL: 
masked_load_zext_v32i8i16_m16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w8, v1.b[1] +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[1] +; NO_SVE-NEXT: orr w9, w9, w15, lsl #7 +; NO_SVE-NEXT: umov w15, v2.b[0] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #8 +; NO_SVE-NEXT: umov w16, v2.b[2] +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: orr w8, w9, w8, lsl #9 +; NO_SVE-NEXT: umov w9, v2.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #10 +; NO_SVE-NEXT: umov w10, v2.b[4] +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w11, w14, #1, #1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #12 +; NO_SVE-NEXT: cmeq v0.8h, v3.8h, #0 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: bfi w11, w16, #2, #1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: bfi w11, w9, #3, #1 +; NO_SVE-NEXT: umov w9, v2.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w11, w10, #4, #1 +; NO_SVE-NEXT: umov w10, v1.b[6] +; NO_SVE-NEXT: bfi w11, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #13 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #6 +; NO_SVE-NEXT: umov w12, v0.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w11, w9, lsl #7 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v0.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: 
orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbz w8, #0, .LBB31_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB31_3 +; NO_SVE-NEXT: b .LBB31_4 +; NO_SVE-NEXT: .LBB31_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB31_4 +; NO_SVE-NEXT: .LBB31_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB31_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB31_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB31_21 +; NO_SVE-NEXT: .LBB31_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB31_22 +; NO_SVE-NEXT: .LBB31_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB31_23 +; NO_SVE-NEXT: .LBB31_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB31_24 +; NO_SVE-NEXT: .LBB31_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB31_25 +; NO_SVE-NEXT: .LBB31_10: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB31_26 +; NO_SVE-NEXT: .LBB31_11: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB31_27 +; NO_SVE-NEXT: .LBB31_12: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB31_28 +; NO_SVE-NEXT: .LBB31_13: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB31_29 +; NO_SVE-NEXT: .LBB31_14: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB31_30 +; NO_SVE-NEXT: .LBB31_15: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB31_31 +; NO_SVE-NEXT: .LBB31_16: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB31_32 +; NO_SVE-NEXT: .LBB31_17: // %else41 +; NO_SVE-NEXT: tbnz w8, #15, .LBB31_33 +; NO_SVE-NEXT: .LBB31_18: // %else44 +; NO_SVE-NEXT: tbz w8, #16, .LBB31_34 +; NO_SVE-NEXT: .LBB31_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #17, .LBB31_35 +; NO_SVE-NEXT: b .LBB31_36 +; NO_SVE-NEXT: .LBB31_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB31_6 +; NO_SVE-NEXT: .LBB31_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB31_7 +; NO_SVE-NEXT: .LBB31_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB31_8 +; NO_SVE-NEXT: .LBB31_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB31_9 +; NO_SVE-NEXT: .LBB31_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB31_10 +; NO_SVE-NEXT: .LBB31_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB31_11 +; NO_SVE-NEXT: .LBB31_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB31_12 +; NO_SVE-NEXT: .LBB31_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB31_13 +; NO_SVE-NEXT: .LBB31_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB31_14 +; NO_SVE-NEXT: .LBB31_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB31_15 +; NO_SVE-NEXT: .LBB31_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 
+; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB31_16 +; NO_SVE-NEXT: .LBB31_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB31_17 +; NO_SVE-NEXT: .LBB31_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB31_18 +; NO_SVE-NEXT: .LBB31_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w8, #16, .LBB31_19 +; NO_SVE-NEXT: .LBB31_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #17, .LBB31_36 +; NO_SVE-NEXT: .LBB31_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB31_36: // %else50 +; NO_SVE-NEXT: tbnz w8, #18, .LBB31_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w8, #19, .LBB31_53 +; NO_SVE-NEXT: .LBB31_38: // %else56 +; NO_SVE-NEXT: tbnz w8, #20, .LBB31_54 +; NO_SVE-NEXT: .LBB31_39: // %else59 +; NO_SVE-NEXT: tbnz w8, #21, .LBB31_55 +; NO_SVE-NEXT: .LBB31_40: // %else62 +; NO_SVE-NEXT: tbnz w8, #22, .LBB31_56 +; NO_SVE-NEXT: .LBB31_41: // %else65 +; NO_SVE-NEXT: tbnz w8, #23, .LBB31_57 +; NO_SVE-NEXT: .LBB31_42: // %else68 +; NO_SVE-NEXT: tbnz w8, #24, .LBB31_58 +; NO_SVE-NEXT: .LBB31_43: // %else71 +; NO_SVE-NEXT: tbnz w8, #25, .LBB31_59 +; NO_SVE-NEXT: .LBB31_44: // %else74 +; NO_SVE-NEXT: tbnz w8, #26, .LBB31_60 +; NO_SVE-NEXT: .LBB31_45: // %else77 +; NO_SVE-NEXT: tbnz w8, #27, .LBB31_61 +; NO_SVE-NEXT: .LBB31_46: // %else80 +; NO_SVE-NEXT: tbnz w8, #28, .LBB31_62 +; NO_SVE-NEXT: .LBB31_47: // %else83 +; NO_SVE-NEXT: tbnz w8, #29, .LBB31_63 +; NO_SVE-NEXT: .LBB31_48: // %else86 +; NO_SVE-NEXT: tbnz w8, #30, .LBB31_64 +; NO_SVE-NEXT: .LBB31_49: // %else89 +; NO_SVE-NEXT: tbz w8, #31, .LBB31_51 +; NO_SVE-NEXT: .LBB31_50: // %cond.load91 +; NO_SVE-NEXT: add x8, x0, #31 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x8] +; NO_SVE-NEXT: .LBB31_51: // %else92 +; NO_SVE-NEXT: ushll2 v1.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v3.8h, v2.16b, #0 +; NO_SVE-NEXT: ushll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB31_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB31_38 +; NO_SVE-NEXT: .LBB31_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB31_39 +; NO_SVE-NEXT: .LBB31_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB31_40 +; NO_SVE-NEXT: .LBB31_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB31_41 +; NO_SVE-NEXT: .LBB31_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB31_42 +; NO_SVE-NEXT: .LBB31_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB31_43 +; NO_SVE-NEXT: .LBB31_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB31_44 +; NO_SVE-NEXT: .LBB31_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB31_45 +; NO_SVE-NEXT: .LBB31_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: 
tbz w8, #27, .LBB31_46 +; NO_SVE-NEXT: .LBB31_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB31_47 +; NO_SVE-NEXT: .LBB31_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB31_48 +; NO_SVE-NEXT: .LBB31_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB31_49 +; NO_SVE-NEXT: .LBB31_64: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #31, .LBB31_50 +; NO_SVE-NEXT: b .LBB31_51 +; ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16_m16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -567,6 +6250,172 @@ } define <16 x i32> @masked_load_zext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v16i8i32_m32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q1, [x1, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB32_18 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB32_19 +; NO_SVE-NEXT: .LBB32_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB32_20 +; NO_SVE-NEXT: .LBB32_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB32_21 +; NO_SVE-NEXT: .LBB32_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB32_22 +; NO_SVE-NEXT: .LBB32_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB32_23 +; NO_SVE-NEXT: 
.LBB32_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB32_24 +; NO_SVE-NEXT: .LBB32_7: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB32_25 +; NO_SVE-NEXT: .LBB32_8: // %else20 +; NO_SVE-NEXT: tbnz w8, #8, .LBB32_26 +; NO_SVE-NEXT: .LBB32_9: // %else23 +; NO_SVE-NEXT: tbnz w8, #9, .LBB32_27 +; NO_SVE-NEXT: .LBB32_10: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB32_28 +; NO_SVE-NEXT: .LBB32_11: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB32_29 +; NO_SVE-NEXT: .LBB32_12: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB32_30 +; NO_SVE-NEXT: .LBB32_13: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB32_31 +; NO_SVE-NEXT: .LBB32_14: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB32_32 +; NO_SVE-NEXT: .LBB32_15: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB32_17 +; NO_SVE-NEXT: .LBB32_16: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x8] +; NO_SVE-NEXT: .LBB32_17: // %else44 +; NO_SVE-NEXT: ushll2 v2.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB32_18: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB32_2 +; NO_SVE-NEXT: .LBB32_19: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB32_3 +; NO_SVE-NEXT: .LBB32_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB32_4 +; NO_SVE-NEXT: .LBB32_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB32_5 +; NO_SVE-NEXT: .LBB32_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB32_6 +; NO_SVE-NEXT: .LBB32_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB32_7 +; NO_SVE-NEXT: .LBB32_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB32_8 +; NO_SVE-NEXT: .LBB32_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB32_9 +; NO_SVE-NEXT: .LBB32_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB32_10 +; NO_SVE-NEXT: .LBB32_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB32_11 +; NO_SVE-NEXT: .LBB32_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB32_12 +; NO_SVE-NEXT: .LBB32_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB32_13 +; NO_SVE-NEXT: .LBB32_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB32_14 +; NO_SVE-NEXT: .LBB32_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB32_15 +; NO_SVE-NEXT: .LBB32_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB32_16 +; NO_SVE-NEXT: b .LBB32_17 +; ; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32_m32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, 
vl16 @@ -583,6 +6432,101 @@ } define <8 x i64> @masked_load_zext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v8i8i64_m64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $d0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB33_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB33_11 +; NO_SVE-NEXT: .LBB33_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB33_12 +; NO_SVE-NEXT: .LBB33_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB33_13 +; NO_SVE-NEXT: .LBB33_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB33_14 +; NO_SVE-NEXT: .LBB33_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB33_15 +; NO_SVE-NEXT: .LBB33_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB33_16 +; NO_SVE-NEXT: .LBB33_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB33_9 +; NO_SVE-NEXT: .LBB33_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x8] +; NO_SVE-NEXT: .LBB33_9: // %else20 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB33_10: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB33_2 +; NO_SVE-NEXT: .LBB33_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB33_3 +; NO_SVE-NEXT: .LBB33_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB33_4 +; NO_SVE-NEXT: .LBB33_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB33_5 +; NO_SVE-NEXT: .LBB33_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB33_6 +; NO_SVE-NEXT: .LBB33_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB33_7 +; NO_SVE-NEXT: .LBB33_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB33_8 +; NO_SVE-NEXT: b .LBB33_9 +; ; 
VBITS_GE_512-LABEL: masked_load_zext_v8i8i64_m64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -599,6 +6543,171 @@ } define <16 x i32> @masked_load_zext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v16i16i32_m32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: ldp q2, q1, [x1, #32] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w8, v0.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[0] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v1.b[1] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v1.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbz w9, #0, .LBB34_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB34_3 +; NO_SVE-NEXT: b .LBB34_4 +; NO_SVE-NEXT: .LBB34_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB34_4 +; NO_SVE-NEXT: .LBB34_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: .LBB34_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB34_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB34_13 +; NO_SVE-NEXT: .LBB34_6: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB34_14 +; NO_SVE-NEXT: .LBB34_7: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB34_15 +; NO_SVE-NEXT: .LBB34_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB34_16 +; NO_SVE-NEXT: .LBB34_9: // %else17 +; NO_SVE-NEXT: tbnz w8, #7, .LBB34_17 +; NO_SVE-NEXT: .LBB34_10: // %else20 +; NO_SVE-NEXT: tbz w8, #8, .LBB34_18 +; NO_SVE-NEXT: .LBB34_11: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #9, .LBB34_19 +; NO_SVE-NEXT: b .LBB34_20 +; NO_SVE-NEXT: .LBB34_12: // %cond.load4 +; 
NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB34_6 +; NO_SVE-NEXT: .LBB34_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB34_7 +; NO_SVE-NEXT: .LBB34_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB34_8 +; NO_SVE-NEXT: .LBB34_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB34_9 +; NO_SVE-NEXT: .LBB34_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB34_10 +; NO_SVE-NEXT: .LBB34_17: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x9] +; NO_SVE-NEXT: tbnz w8, #8, .LBB34_11 +; NO_SVE-NEXT: .LBB34_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #9, .LBB34_20 +; NO_SVE-NEXT: .LBB34_19: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x9] +; NO_SVE-NEXT: .LBB34_20: // %else26 +; NO_SVE-NEXT: tbnz w8, #10, .LBB34_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w8, #11, .LBB34_29 +; NO_SVE-NEXT: .LBB34_22: // %else32 +; NO_SVE-NEXT: tbnz w8, #12, .LBB34_30 +; NO_SVE-NEXT: .LBB34_23: // %else35 +; NO_SVE-NEXT: tbnz w8, #13, .LBB34_31 +; NO_SVE-NEXT: .LBB34_24: // %else38 +; NO_SVE-NEXT: tbnz w8, #14, .LBB34_32 +; NO_SVE-NEXT: .LBB34_25: // %else41 +; NO_SVE-NEXT: tbz w8, #15, .LBB34_27 +; NO_SVE-NEXT: .LBB34_26: // %cond.load43 +; NO_SVE-NEXT: add x8, x0, #30 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x8] +; NO_SVE-NEXT: .LBB34_27: // %else44 +; NO_SVE-NEXT: ushll2 v1.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v3.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB34_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB34_22 +; NO_SVE-NEXT: .LBB34_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB34_23 +; NO_SVE-NEXT: .LBB34_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB34_24 +; NO_SVE-NEXT: .LBB34_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB34_25 +; NO_SVE-NEXT: .LBB34_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #15, .LBB34_26 +; NO_SVE-NEXT: b .LBB34_27 +; ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32_m32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -615,6 +6724,100 @@ } define <8 x i64> @masked_load_zext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v8i16i64_m64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: 
umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB35_10 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB35_11 +; NO_SVE-NEXT: .LBB35_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB35_12 +; NO_SVE-NEXT: .LBB35_3: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB35_13 +; NO_SVE-NEXT: .LBB35_4: // %else8 +; NO_SVE-NEXT: tbnz w8, #4, .LBB35_14 +; NO_SVE-NEXT: .LBB35_5: // %else11 +; NO_SVE-NEXT: tbnz w8, #5, .LBB35_15 +; NO_SVE-NEXT: .LBB35_6: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB35_16 +; NO_SVE-NEXT: .LBB35_7: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB35_9 +; NO_SVE-NEXT: .LBB35_8: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x8] +; NO_SVE-NEXT: .LBB35_9: // %else20 +; NO_SVE-NEXT: ushll2 v2.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB35_10: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB35_2 +; NO_SVE-NEXT: .LBB35_11: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB35_3 +; NO_SVE-NEXT: .LBB35_12: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB35_4 +; NO_SVE-NEXT: .LBB35_13: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB35_5 +; NO_SVE-NEXT: .LBB35_14: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB35_6 +; NO_SVE-NEXT: .LBB35_15: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB35_7 +; NO_SVE-NEXT: .LBB35_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB35_8 +; NO_SVE-NEXT: b .LBB35_9 +; ; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64_m64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -631,6 +6834,99 @@ } define <8 x i64> @masked_load_zext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v8i32i64_m64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.2d, v0.2d, #0 +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: cmeq v1.2d, v1.2d, #0 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: cmeq v2.2d, v2.2d, #0 +; NO_SVE-NEXT: cmeq v3.2d, v3.2d, #0 +; NO_SVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov 
w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB36_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB36_3 +; NO_SVE-NEXT: b .LBB36_4 +; NO_SVE-NEXT: .LBB36_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB36_4 +; NO_SVE-NEXT: .LBB36_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB36_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB36_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB36_9 +; NO_SVE-NEXT: .LBB36_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB36_10 +; NO_SVE-NEXT: .LBB36_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB36_11 +; NO_SVE-NEXT: b .LBB36_12 +; NO_SVE-NEXT: .LBB36_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB36_6 +; NO_SVE-NEXT: .LBB36_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB36_7 +; NO_SVE-NEXT: .LBB36_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB36_12 +; NO_SVE-NEXT: .LBB36_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB36_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB36_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB36_15 +; NO_SVE-NEXT: .LBB36_14: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x8] +; NO_SVE-NEXT: .LBB36_15: // %else20 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB36_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB36_14 +; NO_SVE-NEXT: b .LBB36_15 +; ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64_m64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -647,6 +6943,1190 @@ } define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v128i8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #64 +; NO_SVE-NEXT: .cfi_def_cfa_offset 64 +; NO_SVE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; NO_SVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; NO_SVE-NEXT: .cfi_offset w19, -8 +; NO_SVE-NEXT: .cfi_offset w20, -16 +; NO_SVE-NEXT: .cfi_offset w21, -24 +; NO_SVE-NEXT: .cfi_offset w22, -32 +; NO_SVE-NEXT: ldp q2, q0, [x1, #32] +; NO_SVE-NEXT: cmeq v2.16b, v2.16b, #0 +; NO_SVE-NEXT: cmeq v1.16b, v0.16b, #0 +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: umov w9, 
v1.b[1] +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: umov w15, v1.b[6] +; NO_SVE-NEXT: umov w16, v1.b[7] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[8] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w18, v1.b[9] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w9, v1.b[10] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[11] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[12] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[13] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w18, w18, #0x1 +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[1] +; NO_SVE-NEXT: orr w10, w10, w18, lsl #9 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #10 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: umov w16, v2.b[0] +; NO_SVE-NEXT: umov w17, v2.b[2] +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w18, v2.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #13 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: bfi w13, w10, #1, #1 +; NO_SVE-NEXT: and w10, w18, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[6] +; NO_SVE-NEXT: bfi w13, w14, #2, #1 +; NO_SVE-NEXT: umov w14, v2.b[7] +; NO_SVE-NEXT: bfi w13, w10, #3, #1 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: bfi w13, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v2.b[8] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[9] +; NO_SVE-NEXT: bfi w13, w10, #5, #1 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[10] +; NO_SVE-NEXT: orr w13, w13, w15, lsl #6 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w13, w10, lsl #7 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[11] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v2.b[12] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v1.b[15] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: cmeq v1.16b, v3.16b, #0 +; NO_SVE-NEXT: umov w17, v2.b[14] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #10 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[13] +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #11 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #15 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[4] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #1, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; 
NO_SVE-NEXT: orr w10, w10, w14, lsl #13 +; NO_SVE-NEXT: bfi w15, w13, #2, #1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #3, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[9] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: bfi w15, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v1.b[8] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #14 +; NO_SVE-NEXT: bfi w15, w11, #5, #1 +; NO_SVE-NEXT: umov w18, v1.b[13] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w15, w14, lsl #6 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[10] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #8 +; NO_SVE-NEXT: umov w15, v2.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[11] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[12] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: orr w15, w10, w15, lsl #15 +; NO_SVE-NEXT: orr w10, w11, w13, lsl #10 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[1] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #11 +; NO_SVE-NEXT: umov w11, v0.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[3] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[4] +; NO_SVE-NEXT: umov w17, v0.b[5] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w15, w9, #16, #16 +; NO_SVE-NEXT: bfi w11, w12, #1, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[6] +; NO_SVE-NEXT: bfi w11, w14, #2, #1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v0.b[7] +; NO_SVE-NEXT: bfi w11, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v1.b[14] +; NO_SVE-NEXT: bfi w11, w14, #5, #1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[8] +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[9] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #6 +; NO_SVE-NEXT: umov w14, v0.b[10] +; NO_SVE-NEXT: orr w10, w10, w17, lsl #13 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w16, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[11] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v0.b[12] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #14 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[13] +; NO_SVE-NEXT: orr w11, w11, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[14] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #10 +; NO_SVE-NEXT: umov w13, v1.b[15] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w11, w16, lsl #11 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #12 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: orr w13, w10, w13, lsl #15 +; NO_SVE-NEXT: orr w10, w11, w14, lsl #13 +; NO_SVE-NEXT: orr w9, w10, w12, lsl #14 +; NO_SVE-NEXT: orr w10, w9, w16, lsl #15 +; NO_SVE-NEXT: bfi w10, w13, #16, #16 +; NO_SVE-NEXT: bfi x10, x15, #32, #32 +; NO_SVE-NEXT: tbz w10, #0, .LBB37_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w10, #1, .LBB37_3 +; NO_SVE-NEXT: b .LBB37_4 +; NO_SVE-NEXT: .LBB37_2: +; 
NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w10, #1, .LBB37_4 +; NO_SVE-NEXT: .LBB37_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB37_4: // %else2 +; NO_SVE-NEXT: tbnz w10, #2, .LBB37_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w10, #3, .LBB37_21 +; NO_SVE-NEXT: .LBB37_6: // %else8 +; NO_SVE-NEXT: tbnz w10, #4, .LBB37_22 +; NO_SVE-NEXT: .LBB37_7: // %else11 +; NO_SVE-NEXT: tbnz w10, #5, .LBB37_23 +; NO_SVE-NEXT: .LBB37_8: // %else14 +; NO_SVE-NEXT: tbnz w10, #6, .LBB37_24 +; NO_SVE-NEXT: .LBB37_9: // %else17 +; NO_SVE-NEXT: tbnz w10, #7, .LBB37_25 +; NO_SVE-NEXT: .LBB37_10: // %else20 +; NO_SVE-NEXT: tbnz w10, #8, .LBB37_26 +; NO_SVE-NEXT: .LBB37_11: // %else23 +; NO_SVE-NEXT: tbnz w10, #9, .LBB37_27 +; NO_SVE-NEXT: .LBB37_12: // %else26 +; NO_SVE-NEXT: tbnz w10, #10, .LBB37_28 +; NO_SVE-NEXT: .LBB37_13: // %else29 +; NO_SVE-NEXT: tbnz w10, #11, .LBB37_29 +; NO_SVE-NEXT: .LBB37_14: // %else32 +; NO_SVE-NEXT: tbnz w10, #12, .LBB37_30 +; NO_SVE-NEXT: .LBB37_15: // %else35 +; NO_SVE-NEXT: tbnz w10, #13, .LBB37_31 +; NO_SVE-NEXT: .LBB37_16: // %else38 +; NO_SVE-NEXT: tbnz w10, #14, .LBB37_32 +; NO_SVE-NEXT: .LBB37_17: // %else41 +; NO_SVE-NEXT: tbnz w10, #15, .LBB37_33 +; NO_SVE-NEXT: .LBB37_18: // %else44 +; NO_SVE-NEXT: tbz w10, #16, .LBB37_34 +; NO_SVE-NEXT: .LBB37_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x9] +; NO_SVE-NEXT: tbnz w10, #17, .LBB37_35 +; NO_SVE-NEXT: b .LBB37_36 +; NO_SVE-NEXT: .LBB37_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w10, #3, .LBB37_6 +; NO_SVE-NEXT: .LBB37_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w10, #4, .LBB37_7 +; NO_SVE-NEXT: .LBB37_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w10, #5, .LBB37_8 +; NO_SVE-NEXT: .LBB37_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w10, #6, .LBB37_9 +; NO_SVE-NEXT: .LBB37_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w10, #7, .LBB37_10 +; NO_SVE-NEXT: .LBB37_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w10, #8, .LBB37_11 +; NO_SVE-NEXT: .LBB37_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w10, #9, .LBB37_12 +; NO_SVE-NEXT: .LBB37_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w10, #10, .LBB37_13 +; NO_SVE-NEXT: .LBB37_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w10, #11, .LBB37_14 +; NO_SVE-NEXT: .LBB37_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w10, #12, .LBB37_15 +; NO_SVE-NEXT: .LBB37_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w10, #13, .LBB37_16 +; NO_SVE-NEXT: .LBB37_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w10, #14, .LBB37_17 +; NO_SVE-NEXT: .LBB37_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w10, #15, .LBB37_18 +; NO_SVE-NEXT: .LBB37_33: // %cond.load43 +; 
NO_SVE-NEXT: add x9, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w10, #16, .LBB37_19 +; NO_SVE-NEXT: .LBB37_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w10, #17, .LBB37_36 +; NO_SVE-NEXT: .LBB37_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x9] +; NO_SVE-NEXT: .LBB37_36: // %else50 +; NO_SVE-NEXT: tbnz w10, #18, .LBB37_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w10, #19, .LBB37_53 +; NO_SVE-NEXT: .LBB37_38: // %else56 +; NO_SVE-NEXT: tbnz w10, #20, .LBB37_54 +; NO_SVE-NEXT: .LBB37_39: // %else59 +; NO_SVE-NEXT: tbnz w10, #21, .LBB37_55 +; NO_SVE-NEXT: .LBB37_40: // %else62 +; NO_SVE-NEXT: tbnz w10, #22, .LBB37_56 +; NO_SVE-NEXT: .LBB37_41: // %else65 +; NO_SVE-NEXT: tbnz w10, #23, .LBB37_57 +; NO_SVE-NEXT: .LBB37_42: // %else68 +; NO_SVE-NEXT: tbnz w10, #24, .LBB37_58 +; NO_SVE-NEXT: .LBB37_43: // %else71 +; NO_SVE-NEXT: tbnz w10, #25, .LBB37_59 +; NO_SVE-NEXT: .LBB37_44: // %else74 +; NO_SVE-NEXT: tbnz w10, #26, .LBB37_60 +; NO_SVE-NEXT: .LBB37_45: // %else77 +; NO_SVE-NEXT: tbnz w10, #27, .LBB37_61 +; NO_SVE-NEXT: .LBB37_46: // %else80 +; NO_SVE-NEXT: tbnz w10, #28, .LBB37_62 +; NO_SVE-NEXT: .LBB37_47: // %else83 +; NO_SVE-NEXT: tbnz w10, #29, .LBB37_63 +; NO_SVE-NEXT: .LBB37_48: // %else86 +; NO_SVE-NEXT: tbnz w10, #30, .LBB37_64 +; NO_SVE-NEXT: .LBB37_49: // %else89 +; NO_SVE-NEXT: tbnz w10, #31, .LBB37_65 +; NO_SVE-NEXT: .LBB37_50: // %else92 +; NO_SVE-NEXT: tbz x10, #32, .LBB37_66 +; NO_SVE-NEXT: .LBB37_51: // %cond.load94 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz x10, #33, .LBB37_67 +; NO_SVE-NEXT: b .LBB37_68 +; NO_SVE-NEXT: .LBB37_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x9] +; NO_SVE-NEXT: tbz w10, #19, .LBB37_38 +; NO_SVE-NEXT: .LBB37_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x9] +; NO_SVE-NEXT: tbz w10, #20, .LBB37_39 +; NO_SVE-NEXT: .LBB37_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x9] +; NO_SVE-NEXT: tbz w10, #21, .LBB37_40 +; NO_SVE-NEXT: .LBB37_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x9] +; NO_SVE-NEXT: tbz w10, #22, .LBB37_41 +; NO_SVE-NEXT: .LBB37_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x9] +; NO_SVE-NEXT: tbz w10, #23, .LBB37_42 +; NO_SVE-NEXT: .LBB37_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x9] +; NO_SVE-NEXT: tbz w10, #24, .LBB37_43 +; NO_SVE-NEXT: .LBB37_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x9] +; NO_SVE-NEXT: tbz w10, #25, .LBB37_44 +; NO_SVE-NEXT: .LBB37_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x9] +; NO_SVE-NEXT: tbz w10, #26, .LBB37_45 +; NO_SVE-NEXT: .LBB37_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x9] +; NO_SVE-NEXT: tbz w10, #27, .LBB37_46 +; NO_SVE-NEXT: .LBB37_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x9] +; NO_SVE-NEXT: tbz w10, #28, .LBB37_47 +; NO_SVE-NEXT: .LBB37_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x9] +; NO_SVE-NEXT: tbz w10, #29, .LBB37_48 +; NO_SVE-NEXT: .LBB37_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x9] +; NO_SVE-NEXT: tbz w10, #30, .LBB37_49 +; NO_SVE-NEXT: 
.LBB37_64: // %cond.load88 +; NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x9] +; NO_SVE-NEXT: tbz w10, #31, .LBB37_50 +; NO_SVE-NEXT: .LBB37_65: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x9] +; NO_SVE-NEXT: tbnz x10, #32, .LBB37_51 +; NO_SVE-NEXT: .LBB37_66: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz x10, #33, .LBB37_68 +; NO_SVE-NEXT: .LBB37_67: // %cond.load97 +; NO_SVE-NEXT: add x9, x0, #33 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB37_68: // %else98 +; NO_SVE-NEXT: tbnz x10, #34, .LBB37_91 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x10, #35, .LBB37_92 +; NO_SVE-NEXT: .LBB37_70: // %else104 +; NO_SVE-NEXT: tbnz x10, #36, .LBB37_93 +; NO_SVE-NEXT: .LBB37_71: // %else107 +; NO_SVE-NEXT: tbnz x10, #37, .LBB37_94 +; NO_SVE-NEXT: .LBB37_72: // %else110 +; NO_SVE-NEXT: tbnz x10, #38, .LBB37_95 +; NO_SVE-NEXT: .LBB37_73: // %else113 +; NO_SVE-NEXT: tbnz x10, #39, .LBB37_96 +; NO_SVE-NEXT: .LBB37_74: // %else116 +; NO_SVE-NEXT: tbnz x10, #40, .LBB37_97 +; NO_SVE-NEXT: .LBB37_75: // %else119 +; NO_SVE-NEXT: tbz x10, #41, .LBB37_77 +; NO_SVE-NEXT: .LBB37_76: // %cond.load121 +; NO_SVE-NEXT: add x9, x0, #41 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: .LBB37_77: // %else122 +; NO_SVE-NEXT: ldp q3, q4, [x1, #64] +; NO_SVE-NEXT: ldp q5, q6, [x1, #96] +; NO_SVE-NEXT: tbz x10, #42, .LBB37_79 +; NO_SVE-NEXT: // %bb.78: // %cond.load124 +; NO_SVE-NEXT: add x9, x0, #42 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: .LBB37_79: // %else125 +; NO_SVE-NEXT: cmeq v7.16b, v6.16b, #0 +; NO_SVE-NEXT: cmeq v6.16b, v5.16b, #0 +; NO_SVE-NEXT: cmeq v5.16b, v4.16b, #0 +; NO_SVE-NEXT: cmeq v4.16b, v3.16b, #0 +; NO_SVE-NEXT: tbz x10, #43, .LBB37_81 +; NO_SVE-NEXT: // %bb.80: // %cond.load127 +; NO_SVE-NEXT: add x9, x0, #43 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x9] +; NO_SVE-NEXT: .LBB37_81: // %else128 +; NO_SVE-NEXT: umov w11, v7.b[1] +; NO_SVE-NEXT: umov w16, v7.b[0] +; NO_SVE-NEXT: umov w12, v6.b[1] +; NO_SVE-NEXT: umov w15, v6.b[0] +; NO_SVE-NEXT: umov w13, v5.b[1] +; NO_SVE-NEXT: umov w14, v5.b[0] +; NO_SVE-NEXT: umov w9, v4.b[1] +; NO_SVE-NEXT: umov w18, v4.b[0] +; NO_SVE-NEXT: tbz x10, #44, .LBB37_83 +; NO_SVE-NEXT: // %bb.82: // %cond.load130 +; NO_SVE-NEXT: add x17, x0, #44 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x17] +; NO_SVE-NEXT: .LBB37_83: // %else131 +; NO_SVE-NEXT: umov w4, v7.b[2] +; NO_SVE-NEXT: and w1, w12, #0x1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[2] +; NO_SVE-NEXT: and w15, w13, #0x1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w3, v5.b[2] +; NO_SVE-NEXT: umov w14, v4.b[2] +; NO_SVE-NEXT: and w17, w11, #0x1 +; NO_SVE-NEXT: and w11, w16, #0x1 +; NO_SVE-NEXT: and w16, w9, #0x1 +; NO_SVE-NEXT: and w9, w18, #0x1 +; NO_SVE-NEXT: tbz x10, #45, .LBB37_85 +; NO_SVE-NEXT: // %bb.84: // %cond.load133 +; NO_SVE-NEXT: add x18, x0, #45 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x18] +; NO_SVE-NEXT: .LBB37_85: // %else134 +; NO_SVE-NEXT: bfi w11, w17, #1, #1 +; NO_SVE-NEXT: and w17, w4, #0x1 +; NO_SVE-NEXT: umov w18, v7.b[3] +; NO_SVE-NEXT: bfi w12, w1, #1, #1 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[3] +; NO_SVE-NEXT: bfi w13, w15, #1, #1 +; NO_SVE-NEXT: umov w4, v5.b[3] +; NO_SVE-NEXT: umov w15, v4.b[3] +; NO_SVE-NEXT: and w3, w3, #0x1 +; NO_SVE-NEXT: bfi w9, w16, #1, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: tbz x10, #46, .LBB37_87 +; NO_SVE-NEXT: // %bb.86: // %cond.load136 +; NO_SVE-NEXT: add x16, x0, #46 +; NO_SVE-NEXT: ld1 { 
v2.b }[14], [x16] +; NO_SVE-NEXT: .LBB37_87: // %else137 +; NO_SVE-NEXT: bfi w11, w17, #2, #1 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: umov w18, v7.b[4] +; NO_SVE-NEXT: bfi w12, w1, #2, #1 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[4] +; NO_SVE-NEXT: umov w5, v5.b[4] +; NO_SVE-NEXT: umov w17, v4.b[4] +; NO_SVE-NEXT: bfi w13, w3, #2, #1 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: bfi w9, w14, #2, #1 +; NO_SVE-NEXT: and w4, w15, #0x1 +; NO_SVE-NEXT: tbz x10, #47, .LBB37_89 +; NO_SVE-NEXT: // %bb.88: // %cond.load139 +; NO_SVE-NEXT: add x14, x0, #47 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x14] +; NO_SVE-NEXT: .LBB37_89: // %else140 +; NO_SVE-NEXT: bfi w11, w16, #3, #1 +; NO_SVE-NEXT: umov w16, v7.b[5] +; NO_SVE-NEXT: bfi w12, w1, #3, #1 +; NO_SVE-NEXT: umov w1, v6.b[5] +; NO_SVE-NEXT: bfi w13, w3, #3, #1 +; NO_SVE-NEXT: umov w3, v5.b[5] +; NO_SVE-NEXT: umov w15, v4.b[5] +; NO_SVE-NEXT: and w14, w18, #0x1 +; NO_SVE-NEXT: and w18, w2, #0x1 +; NO_SVE-NEXT: and w2, w5, #0x1 +; NO_SVE-NEXT: bfi w9, w4, #3, #1 +; NO_SVE-NEXT: and w4, w17, #0x1 +; NO_SVE-NEXT: tbz x10, #48, .LBB37_98 +; NO_SVE-NEXT: // %bb.90: // %cond.load142 +; NO_SVE-NEXT: add x17, x0, #48 +; NO_SVE-NEXT: ld1 { v3.b }[0], [x17] +; NO_SVE-NEXT: b .LBB37_99 +; NO_SVE-NEXT: .LBB37_91: // %cond.load100 +; NO_SVE-NEXT: add x9, x0, #34 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz x10, #35, .LBB37_70 +; NO_SVE-NEXT: .LBB37_92: // %cond.load103 +; NO_SVE-NEXT: add x9, x0, #35 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz x10, #36, .LBB37_71 +; NO_SVE-NEXT: .LBB37_93: // %cond.load106 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz x10, #37, .LBB37_72 +; NO_SVE-NEXT: .LBB37_94: // %cond.load109 +; NO_SVE-NEXT: add x9, x0, #37 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz x10, #38, .LBB37_73 +; NO_SVE-NEXT: .LBB37_95: // %cond.load112 +; NO_SVE-NEXT: add x9, x0, #38 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz x10, #39, .LBB37_74 +; NO_SVE-NEXT: .LBB37_96: // %cond.load115 +; NO_SVE-NEXT: add x9, x0, #39 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz x10, #40, .LBB37_75 +; NO_SVE-NEXT: .LBB37_97: // %cond.load118 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: tbnz x10, #41, .LBB37_76 +; NO_SVE-NEXT: b .LBB37_77 +; NO_SVE-NEXT: .LBB37_98: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: .LBB37_99: // %else143 +; NO_SVE-NEXT: bfi w11, w14, #4, #1 +; NO_SVE-NEXT: umov w17, v7.b[6] +; NO_SVE-NEXT: bfi w12, w18, #4, #1 +; NO_SVE-NEXT: and w18, w1, #0x1 +; NO_SVE-NEXT: umov w1, v6.b[6] +; NO_SVE-NEXT: bfi w13, w2, #4, #1 +; NO_SVE-NEXT: and w2, w3, #0x1 +; NO_SVE-NEXT: umov w3, v5.b[6] +; NO_SVE-NEXT: umov w14, v4.b[6] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: bfi w9, w4, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: tbz x10, #49, .LBB37_101 +; NO_SVE-NEXT: // %bb.100: // %cond.load145 +; NO_SVE-NEXT: add x4, x0, #49 +; NO_SVE-NEXT: ld1 { v3.b }[1], [x4] +; NO_SVE-NEXT: .LBB37_101: // %else146 +; NO_SVE-NEXT: bfi w11, w16, #5, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v7.b[7] +; NO_SVE-NEXT: bfi w12, w18, #5, #1 +; NO_SVE-NEXT: and w18, w1, #0x1 +; NO_SVE-NEXT: umov w1, v6.b[7] +; NO_SVE-NEXT: bfi w13, w2, #5, #1 +; NO_SVE-NEXT: umov w4, v5.b[7] +; NO_SVE-NEXT: umov w2, v4.b[7] +; NO_SVE-NEXT: and w3, w3, #0x1 +; NO_SVE-NEXT: bfi w9, w15, #5, #1 +; NO_SVE-NEXT: and w5, w14, #0x1 +; NO_SVE-NEXT: tbz x10, #50, .LBB37_103 +; 
NO_SVE-NEXT: // %bb.102: // %cond.load148 +; NO_SVE-NEXT: add x14, x0, #50 +; NO_SVE-NEXT: ld1 { v3.b }[2], [x14] +; NO_SVE-NEXT: .LBB37_103: // %else149 +; NO_SVE-NEXT: orr w14, w11, w16, lsl #6 +; NO_SVE-NEXT: umov w7, v7.b[8] +; NO_SVE-NEXT: umov w20, v6.b[8] +; NO_SVE-NEXT: umov w22, v5.b[8] +; NO_SVE-NEXT: umov w11, v4.b[8] +; NO_SVE-NEXT: and w6, w17, #0x1 +; NO_SVE-NEXT: orr w15, w12, w18, lsl #6 +; NO_SVE-NEXT: and w19, w1, #0x1 +; NO_SVE-NEXT: orr w16, w13, w3, lsl #6 +; NO_SVE-NEXT: and w21, w4, #0x1 +; NO_SVE-NEXT: orr w17, w9, w5, lsl #6 +; NO_SVE-NEXT: and w18, w2, #0x1 +; NO_SVE-NEXT: tbz x10, #51, .LBB37_105 +; NO_SVE-NEXT: // %bb.104: // %cond.load151 +; NO_SVE-NEXT: add x9, x0, #51 +; NO_SVE-NEXT: ld1 { v3.b }[3], [x9] +; NO_SVE-NEXT: .LBB37_105: // %else152 +; NO_SVE-NEXT: orr w12, w14, w6, lsl #7 +; NO_SVE-NEXT: umov w2, v7.b[9] +; NO_SVE-NEXT: umov w4, v6.b[9] +; NO_SVE-NEXT: umov w6, v5.b[9] +; NO_SVE-NEXT: umov w9, v4.b[9] +; NO_SVE-NEXT: and w1, w7, #0x1 +; NO_SVE-NEXT: orr w13, w15, w19, lsl #7 +; NO_SVE-NEXT: and w3, w20, #0x1 +; NO_SVE-NEXT: orr w14, w16, w21, lsl #7 +; NO_SVE-NEXT: and w5, w22, #0x1 +; NO_SVE-NEXT: orr w15, w17, w18, lsl #7 +; NO_SVE-NEXT: and w16, w11, #0x1 +; NO_SVE-NEXT: tbz x10, #52, .LBB37_107 +; NO_SVE-NEXT: // %bb.106: // %cond.load154 +; NO_SVE-NEXT: add x11, x0, #52 +; NO_SVE-NEXT: ld1 { v3.b }[4], [x11] +; NO_SVE-NEXT: .LBB37_107: // %else155 +; NO_SVE-NEXT: orr w12, w12, w1, lsl #8 +; NO_SVE-NEXT: and w17, w2, #0x1 +; NO_SVE-NEXT: umov w18, v7.b[10] +; NO_SVE-NEXT: and w1, w4, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[10] +; NO_SVE-NEXT: umov w4, v5.b[10] +; NO_SVE-NEXT: umov w11, v4.b[10] +; NO_SVE-NEXT: orr w13, w13, w3, lsl #8 +; NO_SVE-NEXT: orr w14, w14, w5, lsl #8 +; NO_SVE-NEXT: and w3, w6, #0x1 +; NO_SVE-NEXT: orr w15, w15, w16, lsl #8 +; NO_SVE-NEXT: and w16, w9, #0x1 +; NO_SVE-NEXT: tbz x10, #53, .LBB37_109 +; NO_SVE-NEXT: // %bb.108: // %cond.load157 +; NO_SVE-NEXT: add x9, x0, #53 +; NO_SVE-NEXT: ld1 { v3.b }[5], [x9] +; NO_SVE-NEXT: .LBB37_109: // %else158 +; NO_SVE-NEXT: orr w12, w12, w17, lsl #9 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v7.b[11] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #9 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[11] +; NO_SVE-NEXT: orr w14, w14, w3, lsl #9 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v5.b[11] +; NO_SVE-NEXT: umov w9, v4.b[11] +; NO_SVE-NEXT: orr w15, w15, w16, lsl #9 +; NO_SVE-NEXT: and w16, w11, #0x1 +; NO_SVE-NEXT: tbz x10, #54, .LBB37_111 +; NO_SVE-NEXT: // %bb.110: // %cond.load160 +; NO_SVE-NEXT: add x11, x0, #54 +; NO_SVE-NEXT: ld1 { v3.b }[6], [x11] +; NO_SVE-NEXT: .LBB37_111: // %else161 +; NO_SVE-NEXT: orr w12, w12, w17, lsl #10 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v7.b[12] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #10 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[12] +; NO_SVE-NEXT: orr w14, w14, w3, lsl #10 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v5.b[12] +; NO_SVE-NEXT: umov w11, v4.b[12] +; NO_SVE-NEXT: orr w15, w15, w16, lsl #10 +; NO_SVE-NEXT: and w16, w9, #0x1 +; NO_SVE-NEXT: tbz x10, #55, .LBB37_113 +; NO_SVE-NEXT: // %bb.112: // %cond.load163 +; NO_SVE-NEXT: add x9, x0, #55 +; NO_SVE-NEXT: ld1 { v3.b }[7], [x9] +; NO_SVE-NEXT: .LBB37_113: // %else164 +; NO_SVE-NEXT: orr w12, w12, w17, lsl #11 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v7.b[13] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #11 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[13] +; 
NO_SVE-NEXT: orr w14, w14, w3, lsl #11 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v5.b[13] +; NO_SVE-NEXT: umov w9, v4.b[13] +; NO_SVE-NEXT: orr w15, w15, w16, lsl #11 +; NO_SVE-NEXT: and w16, w11, #0x1 +; NO_SVE-NEXT: tbz x10, #56, .LBB37_115 +; NO_SVE-NEXT: // %bb.114: // %cond.load166 +; NO_SVE-NEXT: add x11, x0, #56 +; NO_SVE-NEXT: ld1 { v3.b }[8], [x11] +; NO_SVE-NEXT: .LBB37_115: // %else167 +; NO_SVE-NEXT: orr w11, w12, w17, lsl #12 +; NO_SVE-NEXT: and w12, w18, #0x1 +; NO_SVE-NEXT: umov w17, v7.b[14] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #12 +; NO_SVE-NEXT: and w18, w2, #0x1 +; NO_SVE-NEXT: umov w1, v6.b[14] +; NO_SVE-NEXT: orr w14, w14, w3, lsl #12 +; NO_SVE-NEXT: and w2, w4, #0x1 +; NO_SVE-NEXT: umov w3, v5.b[14] +; NO_SVE-NEXT: umov w4, v4.b[14] +; NO_SVE-NEXT: orr w5, w15, w16, lsl #12 +; NO_SVE-NEXT: and w6, w9, #0x1 +; NO_SVE-NEXT: tbz x10, #57, .LBB37_117 +; NO_SVE-NEXT: // %bb.116: // %cond.load169 +; NO_SVE-NEXT: add x9, x0, #57 +; NO_SVE-NEXT: ld1 { v3.b }[9], [x9] +; NO_SVE-NEXT: .LBB37_117: // %else170 +; NO_SVE-NEXT: orr w15, w11, w12, lsl #13 +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: orr w16, w13, w18, lsl #13 +; NO_SVE-NEXT: and w18, w1, #0x1 +; NO_SVE-NEXT: orr w9, w14, w2, lsl #13 +; NO_SVE-NEXT: and w11, w3, #0x1 +; NO_SVE-NEXT: orr w12, w5, w6, lsl #13 +; NO_SVE-NEXT: and w13, w4, #0x1 +; NO_SVE-NEXT: tbz x10, #58, .LBB37_119 +; NO_SVE-NEXT: // %bb.118: // %cond.load172 +; NO_SVE-NEXT: add x14, x0, #58 +; NO_SVE-NEXT: ld1 { v3.b }[10], [x14] +; NO_SVE-NEXT: .LBB37_119: // %else173 +; NO_SVE-NEXT: orr w14, w15, w17, lsl #14 +; NO_SVE-NEXT: umov w15, v7.b[15] +; NO_SVE-NEXT: orr w16, w16, w18, lsl #14 +; NO_SVE-NEXT: umov w17, v6.b[15] +; NO_SVE-NEXT: umov w18, v5.b[15] +; NO_SVE-NEXT: umov w1, v4.b[15] +; NO_SVE-NEXT: orr w2, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: tbz x10, #59, .LBB37_121 +; NO_SVE-NEXT: // %bb.120: // %cond.load175 +; NO_SVE-NEXT: add x11, x0, #59 +; NO_SVE-NEXT: ld1 { v3.b }[11], [x11] +; NO_SVE-NEXT: .LBB37_121: // %else176 +; NO_SVE-NEXT: orr w12, w14, w15, lsl #15 +; NO_SVE-NEXT: orr w11, w16, w17, lsl #15 +; NO_SVE-NEXT: orr w13, w2, w18, lsl #15 +; NO_SVE-NEXT: orr w9, w9, w1, lsl #15 +; NO_SVE-NEXT: tbz x10, #60, .LBB37_123 +; NO_SVE-NEXT: // %bb.122: // %cond.load178 +; NO_SVE-NEXT: add x14, x0, #60 +; NO_SVE-NEXT: ld1 { v3.b }[12], [x14] +; NO_SVE-NEXT: .LBB37_123: // %else179 +; NO_SVE-NEXT: bfi w11, w12, #16, #16 +; NO_SVE-NEXT: bfi w9, w13, #16, #16 +; NO_SVE-NEXT: tbnz x10, #61, .LBB37_128 +; NO_SVE-NEXT: // %bb.124: // %else182 +; NO_SVE-NEXT: tbnz x10, #62, .LBB37_129 +; NO_SVE-NEXT: .LBB37_125: // %else185 +; NO_SVE-NEXT: bfi x9, x11, #32, #32 +; NO_SVE-NEXT: tbnz x10, #63, .LBB37_130 +; NO_SVE-NEXT: .LBB37_126: // %else188 +; NO_SVE-NEXT: tbz w9, #0, .LBB37_131 +; NO_SVE-NEXT: .LBB37_127: // %cond.load190 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #1, .LBB37_132 +; NO_SVE-NEXT: b .LBB37_133 +; NO_SVE-NEXT: .LBB37_128: // %cond.load181 +; NO_SVE-NEXT: add x12, x0, #61 +; NO_SVE-NEXT: ld1 { v3.b }[13], [x12] +; NO_SVE-NEXT: tbz x10, #62, .LBB37_125 +; NO_SVE-NEXT: .LBB37_129: // %cond.load184 +; NO_SVE-NEXT: add x12, x0, #62 +; NO_SVE-NEXT: ld1 { v3.b }[14], [x12] +; NO_SVE-NEXT: bfi x9, x11, #32, #32 +; NO_SVE-NEXT: tbz x10, #63, .LBB37_126 +; NO_SVE-NEXT: .LBB37_130: // %cond.load187 +; NO_SVE-NEXT: add x10, x0, #63 +; NO_SVE-NEXT: ld1 { v3.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #0, .LBB37_127 +; NO_SVE-NEXT: 
.LBB37_131: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w9, #1, .LBB37_133 +; NO_SVE-NEXT: .LBB37_132: // %cond.load193 +; NO_SVE-NEXT: add x10, x0, #65 +; NO_SVE-NEXT: ld1 { v4.b }[1], [x10] +; NO_SVE-NEXT: .LBB37_133: // %else194 +; NO_SVE-NEXT: tbnz w9, #2, .LBB37_149 +; NO_SVE-NEXT: // %bb.134: // %else197 +; NO_SVE-NEXT: tbnz w9, #3, .LBB37_150 +; NO_SVE-NEXT: .LBB37_135: // %else200 +; NO_SVE-NEXT: tbnz w9, #4, .LBB37_151 +; NO_SVE-NEXT: .LBB37_136: // %else203 +; NO_SVE-NEXT: tbnz w9, #5, .LBB37_152 +; NO_SVE-NEXT: .LBB37_137: // %else206 +; NO_SVE-NEXT: tbnz w9, #6, .LBB37_153 +; NO_SVE-NEXT: .LBB37_138: // %else209 +; NO_SVE-NEXT: tbnz w9, #7, .LBB37_154 +; NO_SVE-NEXT: .LBB37_139: // %else212 +; NO_SVE-NEXT: tbnz w9, #8, .LBB37_155 +; NO_SVE-NEXT: .LBB37_140: // %else215 +; NO_SVE-NEXT: tbnz w9, #9, .LBB37_156 +; NO_SVE-NEXT: .LBB37_141: // %else218 +; NO_SVE-NEXT: tbnz w9, #10, .LBB37_157 +; NO_SVE-NEXT: .LBB37_142: // %else221 +; NO_SVE-NEXT: tbnz w9, #11, .LBB37_158 +; NO_SVE-NEXT: .LBB37_143: // %else224 +; NO_SVE-NEXT: tbnz w9, #12, .LBB37_159 +; NO_SVE-NEXT: .LBB37_144: // %else227 +; NO_SVE-NEXT: tbnz w9, #13, .LBB37_160 +; NO_SVE-NEXT: .LBB37_145: // %else230 +; NO_SVE-NEXT: tbnz w9, #14, .LBB37_161 +; NO_SVE-NEXT: .LBB37_146: // %else233 +; NO_SVE-NEXT: tbnz w9, #15, .LBB37_162 +; NO_SVE-NEXT: .LBB37_147: // %else236 +; NO_SVE-NEXT: tbz w9, #16, .LBB37_163 +; NO_SVE-NEXT: .LBB37_148: // %cond.load238 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB37_164 +; NO_SVE-NEXT: b .LBB37_165 +; NO_SVE-NEXT: .LBB37_149: // %cond.load196 +; NO_SVE-NEXT: add x10, x0, #66 +; NO_SVE-NEXT: ld1 { v4.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB37_135 +; NO_SVE-NEXT: .LBB37_150: // %cond.load199 +; NO_SVE-NEXT: add x10, x0, #67 +; NO_SVE-NEXT: ld1 { v4.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB37_136 +; NO_SVE-NEXT: .LBB37_151: // %cond.load202 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB37_137 +; NO_SVE-NEXT: .LBB37_152: // %cond.load205 +; NO_SVE-NEXT: add x10, x0, #69 +; NO_SVE-NEXT: ld1 { v4.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB37_138 +; NO_SVE-NEXT: .LBB37_153: // %cond.load208 +; NO_SVE-NEXT: add x10, x0, #70 +; NO_SVE-NEXT: ld1 { v4.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB37_139 +; NO_SVE-NEXT: .LBB37_154: // %cond.load211 +; NO_SVE-NEXT: add x10, x0, #71 +; NO_SVE-NEXT: ld1 { v4.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #8, .LBB37_140 +; NO_SVE-NEXT: .LBB37_155: // %cond.load214 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #9, .LBB37_141 +; NO_SVE-NEXT: .LBB37_156: // %cond.load217 +; NO_SVE-NEXT: add x10, x0, #73 +; NO_SVE-NEXT: ld1 { v4.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #10, .LBB37_142 +; NO_SVE-NEXT: .LBB37_157: // %cond.load220 +; NO_SVE-NEXT: add x10, x0, #74 +; NO_SVE-NEXT: ld1 { v4.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB37_143 +; NO_SVE-NEXT: .LBB37_158: // %cond.load223 +; NO_SVE-NEXT: add x10, x0, #75 +; NO_SVE-NEXT: ld1 { v4.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB37_144 +; NO_SVE-NEXT: .LBB37_159: // %cond.load226 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB37_145 +; NO_SVE-NEXT: .LBB37_160: // %cond.load229 +; NO_SVE-NEXT: add x10, x0, #77 +; NO_SVE-NEXT: ld1 { v4.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB37_146 +; NO_SVE-NEXT: .LBB37_161: // %cond.load232 +; NO_SVE-NEXT: add x10, x0, 
#78 +; NO_SVE-NEXT: ld1 { v4.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB37_147 +; NO_SVE-NEXT: .LBB37_162: // %cond.load235 +; NO_SVE-NEXT: add x10, x0, #79 +; NO_SVE-NEXT: ld1 { v4.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB37_148 +; NO_SVE-NEXT: .LBB37_163: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w9, #17, .LBB37_165 +; NO_SVE-NEXT: .LBB37_164: // %cond.load241 +; NO_SVE-NEXT: add x10, x0, #81 +; NO_SVE-NEXT: ld1 { v5.b }[1], [x10] +; NO_SVE-NEXT: .LBB37_165: // %else242 +; NO_SVE-NEXT: tbnz w9, #18, .LBB37_181 +; NO_SVE-NEXT: // %bb.166: // %else245 +; NO_SVE-NEXT: tbnz w9, #19, .LBB37_182 +; NO_SVE-NEXT: .LBB37_167: // %else248 +; NO_SVE-NEXT: tbnz w9, #20, .LBB37_183 +; NO_SVE-NEXT: .LBB37_168: // %else251 +; NO_SVE-NEXT: tbnz w9, #21, .LBB37_184 +; NO_SVE-NEXT: .LBB37_169: // %else254 +; NO_SVE-NEXT: tbnz w9, #22, .LBB37_185 +; NO_SVE-NEXT: .LBB37_170: // %else257 +; NO_SVE-NEXT: tbnz w9, #23, .LBB37_186 +; NO_SVE-NEXT: .LBB37_171: // %else260 +; NO_SVE-NEXT: tbnz w9, #24, .LBB37_187 +; NO_SVE-NEXT: .LBB37_172: // %else263 +; NO_SVE-NEXT: tbnz w9, #25, .LBB37_188 +; NO_SVE-NEXT: .LBB37_173: // %else266 +; NO_SVE-NEXT: tbnz w9, #26, .LBB37_189 +; NO_SVE-NEXT: .LBB37_174: // %else269 +; NO_SVE-NEXT: tbnz w9, #27, .LBB37_190 +; NO_SVE-NEXT: .LBB37_175: // %else272 +; NO_SVE-NEXT: tbnz w9, #28, .LBB37_191 +; NO_SVE-NEXT: .LBB37_176: // %else275 +; NO_SVE-NEXT: tbnz w9, #29, .LBB37_192 +; NO_SVE-NEXT: .LBB37_177: // %else278 +; NO_SVE-NEXT: tbnz w9, #30, .LBB37_193 +; NO_SVE-NEXT: .LBB37_178: // %else281 +; NO_SVE-NEXT: tbnz w9, #31, .LBB37_194 +; NO_SVE-NEXT: .LBB37_179: // %else284 +; NO_SVE-NEXT: tbz x9, #32, .LBB37_195 +; NO_SVE-NEXT: .LBB37_180: // %cond.load286 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #33, .LBB37_196 +; NO_SVE-NEXT: b .LBB37_197 +; NO_SVE-NEXT: .LBB37_181: // %cond.load244 +; NO_SVE-NEXT: add x10, x0, #82 +; NO_SVE-NEXT: ld1 { v5.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB37_167 +; NO_SVE-NEXT: .LBB37_182: // %cond.load247 +; NO_SVE-NEXT: add x10, x0, #83 +; NO_SVE-NEXT: ld1 { v5.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB37_168 +; NO_SVE-NEXT: .LBB37_183: // %cond.load250 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB37_169 +; NO_SVE-NEXT: .LBB37_184: // %cond.load253 +; NO_SVE-NEXT: add x10, x0, #85 +; NO_SVE-NEXT: ld1 { v5.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB37_170 +; NO_SVE-NEXT: .LBB37_185: // %cond.load256 +; NO_SVE-NEXT: add x10, x0, #86 +; NO_SVE-NEXT: ld1 { v5.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB37_171 +; NO_SVE-NEXT: .LBB37_186: // %cond.load259 +; NO_SVE-NEXT: add x10, x0, #87 +; NO_SVE-NEXT: ld1 { v5.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB37_172 +; NO_SVE-NEXT: .LBB37_187: // %cond.load262 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB37_173 +; NO_SVE-NEXT: .LBB37_188: // %cond.load265 +; NO_SVE-NEXT: add x10, x0, #89 +; NO_SVE-NEXT: ld1 { v5.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB37_174 +; NO_SVE-NEXT: .LBB37_189: // %cond.load268 +; NO_SVE-NEXT: add x10, x0, #90 +; NO_SVE-NEXT: ld1 { v5.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB37_175 +; NO_SVE-NEXT: .LBB37_190: // %cond.load271 +; NO_SVE-NEXT: add x10, x0, #91 +; NO_SVE-NEXT: ld1 { v5.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB37_176 +; NO_SVE-NEXT: .LBB37_191: // %cond.load274 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.b }[12], 
[x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB37_177 +; NO_SVE-NEXT: .LBB37_192: // %cond.load277 +; NO_SVE-NEXT: add x10, x0, #93 +; NO_SVE-NEXT: ld1 { v5.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB37_178 +; NO_SVE-NEXT: .LBB37_193: // %cond.load280 +; NO_SVE-NEXT: add x10, x0, #94 +; NO_SVE-NEXT: ld1 { v5.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB37_179 +; NO_SVE-NEXT: .LBB37_194: // %cond.load283 +; NO_SVE-NEXT: add x10, x0, #95 +; NO_SVE-NEXT: ld1 { v5.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #32, .LBB37_180 +; NO_SVE-NEXT: .LBB37_195: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz x9, #33, .LBB37_197 +; NO_SVE-NEXT: .LBB37_196: // %cond.load289 +; NO_SVE-NEXT: add x10, x0, #97 +; NO_SVE-NEXT: ld1 { v6.b }[1], [x10] +; NO_SVE-NEXT: .LBB37_197: // %else290 +; NO_SVE-NEXT: tbnz x9, #34, .LBB37_213 +; NO_SVE-NEXT: // %bb.198: // %else293 +; NO_SVE-NEXT: tbnz x9, #35, .LBB37_214 +; NO_SVE-NEXT: .LBB37_199: // %else296 +; NO_SVE-NEXT: tbnz x9, #36, .LBB37_215 +; NO_SVE-NEXT: .LBB37_200: // %else299 +; NO_SVE-NEXT: tbnz x9, #37, .LBB37_216 +; NO_SVE-NEXT: .LBB37_201: // %else302 +; NO_SVE-NEXT: tbnz x9, #38, .LBB37_217 +; NO_SVE-NEXT: .LBB37_202: // %else305 +; NO_SVE-NEXT: tbnz x9, #39, .LBB37_218 +; NO_SVE-NEXT: .LBB37_203: // %else308 +; NO_SVE-NEXT: tbnz x9, #40, .LBB37_219 +; NO_SVE-NEXT: .LBB37_204: // %else311 +; NO_SVE-NEXT: tbnz x9, #41, .LBB37_220 +; NO_SVE-NEXT: .LBB37_205: // %else314 +; NO_SVE-NEXT: tbnz x9, #42, .LBB37_221 +; NO_SVE-NEXT: .LBB37_206: // %else317 +; NO_SVE-NEXT: tbnz x9, #43, .LBB37_222 +; NO_SVE-NEXT: .LBB37_207: // %else320 +; NO_SVE-NEXT: tbnz x9, #44, .LBB37_223 +; NO_SVE-NEXT: .LBB37_208: // %else323 +; NO_SVE-NEXT: tbnz x9, #45, .LBB37_224 +; NO_SVE-NEXT: .LBB37_209: // %else326 +; NO_SVE-NEXT: tbnz x9, #46, .LBB37_225 +; NO_SVE-NEXT: .LBB37_210: // %else329 +; NO_SVE-NEXT: tbnz x9, #47, .LBB37_226 +; NO_SVE-NEXT: .LBB37_211: // %else332 +; NO_SVE-NEXT: tbz x9, #48, .LBB37_227 +; NO_SVE-NEXT: .LBB37_212: // %cond.load334 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #49, .LBB37_228 +; NO_SVE-NEXT: b .LBB37_229 +; NO_SVE-NEXT: .LBB37_213: // %cond.load292 +; NO_SVE-NEXT: add x10, x0, #98 +; NO_SVE-NEXT: ld1 { v6.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB37_199 +; NO_SVE-NEXT: .LBB37_214: // %cond.load295 +; NO_SVE-NEXT: add x10, x0, #99 +; NO_SVE-NEXT: ld1 { v6.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB37_200 +; NO_SVE-NEXT: .LBB37_215: // %cond.load298 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB37_201 +; NO_SVE-NEXT: .LBB37_216: // %cond.load301 +; NO_SVE-NEXT: add x10, x0, #101 +; NO_SVE-NEXT: ld1 { v6.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB37_202 +; NO_SVE-NEXT: .LBB37_217: // %cond.load304 +; NO_SVE-NEXT: add x10, x0, #102 +; NO_SVE-NEXT: ld1 { v6.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB37_203 +; NO_SVE-NEXT: .LBB37_218: // %cond.load307 +; NO_SVE-NEXT: add x10, x0, #103 +; NO_SVE-NEXT: ld1 { v6.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #40, .LBB37_204 +; NO_SVE-NEXT: .LBB37_219: // %cond.load310 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #41, .LBB37_205 +; NO_SVE-NEXT: .LBB37_220: // %cond.load313 +; NO_SVE-NEXT: add x10, x0, #105 +; NO_SVE-NEXT: ld1 { v6.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #42, .LBB37_206 +; NO_SVE-NEXT: .LBB37_221: // %cond.load316 +; NO_SVE-NEXT: add x10, x0, #106 +; NO_SVE-NEXT: ld1 { v6.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #43, 
.LBB37_207 +; NO_SVE-NEXT: .LBB37_222: // %cond.load319 +; NO_SVE-NEXT: add x10, x0, #107 +; NO_SVE-NEXT: ld1 { v6.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #44, .LBB37_208 +; NO_SVE-NEXT: .LBB37_223: // %cond.load322 +; NO_SVE-NEXT: add x10, x0, #108 +; NO_SVE-NEXT: ld1 { v6.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #45, .LBB37_209 +; NO_SVE-NEXT: .LBB37_224: // %cond.load325 +; NO_SVE-NEXT: add x10, x0, #109 +; NO_SVE-NEXT: ld1 { v6.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB37_210 +; NO_SVE-NEXT: .LBB37_225: // %cond.load328 +; NO_SVE-NEXT: add x10, x0, #110 +; NO_SVE-NEXT: ld1 { v6.b }[14], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB37_211 +; NO_SVE-NEXT: .LBB37_226: // %cond.load331 +; NO_SVE-NEXT: add x10, x0, #111 +; NO_SVE-NEXT: ld1 { v6.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #48, .LBB37_212 +; NO_SVE-NEXT: .LBB37_227: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz x9, #49, .LBB37_229 +; NO_SVE-NEXT: .LBB37_228: // %cond.load337 +; NO_SVE-NEXT: add x10, x0, #113 +; NO_SVE-NEXT: ld1 { v7.b }[1], [x10] +; NO_SVE-NEXT: .LBB37_229: // %else338 +; NO_SVE-NEXT: tbnz x9, #50, .LBB37_245 +; NO_SVE-NEXT: // %bb.230: // %else341 +; NO_SVE-NEXT: tbnz x9, #51, .LBB37_246 +; NO_SVE-NEXT: .LBB37_231: // %else344 +; NO_SVE-NEXT: tbnz x9, #52, .LBB37_247 +; NO_SVE-NEXT: .LBB37_232: // %else347 +; NO_SVE-NEXT: tbnz x9, #53, .LBB37_248 +; NO_SVE-NEXT: .LBB37_233: // %else350 +; NO_SVE-NEXT: tbnz x9, #54, .LBB37_249 +; NO_SVE-NEXT: .LBB37_234: // %else353 +; NO_SVE-NEXT: tbnz x9, #55, .LBB37_250 +; NO_SVE-NEXT: .LBB37_235: // %else356 +; NO_SVE-NEXT: tbnz x9, #56, .LBB37_251 +; NO_SVE-NEXT: .LBB37_236: // %else359 +; NO_SVE-NEXT: tbnz x9, #57, .LBB37_252 +; NO_SVE-NEXT: .LBB37_237: // %else362 +; NO_SVE-NEXT: tbnz x9, #58, .LBB37_253 +; NO_SVE-NEXT: .LBB37_238: // %else365 +; NO_SVE-NEXT: tbnz x9, #59, .LBB37_254 +; NO_SVE-NEXT: .LBB37_239: // %else368 +; NO_SVE-NEXT: tbnz x9, #60, .LBB37_255 +; NO_SVE-NEXT: .LBB37_240: // %else371 +; NO_SVE-NEXT: tbnz x9, #61, .LBB37_256 +; NO_SVE-NEXT: .LBB37_241: // %else374 +; NO_SVE-NEXT: tbnz x9, #62, .LBB37_257 +; NO_SVE-NEXT: .LBB37_242: // %else377 +; NO_SVE-NEXT: tbz x9, #63, .LBB37_244 +; NO_SVE-NEXT: .LBB37_243: // %cond.load379 +; NO_SVE-NEXT: add x9, x0, #127 +; NO_SVE-NEXT: ld1 { v7.b }[15], [x9] +; NO_SVE-NEXT: .LBB37_244: // %else380 +; NO_SVE-NEXT: sshll2 v16.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v17.8h, v1.16b, #0 +; NO_SVE-NEXT: stp q0, q16, [x8] +; NO_SVE-NEXT: sshll v1.8h, v1.8b, #0 +; NO_SVE-NEXT: sshll2 v0.8h, v2.16b, #0 +; NO_SVE-NEXT: sshll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: stp q1, q17, [x8, #32] +; NO_SVE-NEXT: sshll2 v1.8h, v3.16b, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #64] +; NO_SVE-NEXT: sshll v0.8h, v3.8b, #0 +; NO_SVE-NEXT: sshll2 v2.8h, v4.16b, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #96] +; NO_SVE-NEXT: sshll v1.8h, v4.8b, #0 +; NO_SVE-NEXT: sshll2 v0.8h, v5.16b, #0 +; NO_SVE-NEXT: stp q1, q2, [x8, #128] +; NO_SVE-NEXT: sshll v2.8h, v5.8b, #0 +; NO_SVE-NEXT: sshll2 v1.8h, v6.16b, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #160] +; NO_SVE-NEXT: sshll v0.8h, v6.8b, #0 +; NO_SVE-NEXT: sshll2 v2.8h, v7.16b, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #192] +; NO_SVE-NEXT: sshll v1.8h, v7.8b, #0 +; NO_SVE-NEXT: stp q1, q2, [x8, #224] +; NO_SVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; NO_SVE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; NO_SVE-NEXT: add sp, sp, #64 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB37_245: // %cond.load340 +; NO_SVE-NEXT: add x10, x0, #114 +; NO_SVE-NEXT: ld1 { v7.b 
}[2], [x10] +; NO_SVE-NEXT: tbz x9, #51, .LBB37_231 +; NO_SVE-NEXT: .LBB37_246: // %cond.load343 +; NO_SVE-NEXT: add x10, x0, #115 +; NO_SVE-NEXT: ld1 { v7.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB37_232 +; NO_SVE-NEXT: .LBB37_247: // %cond.load346 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB37_233 +; NO_SVE-NEXT: .LBB37_248: // %cond.load349 +; NO_SVE-NEXT: add x10, x0, #117 +; NO_SVE-NEXT: ld1 { v7.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB37_234 +; NO_SVE-NEXT: .LBB37_249: // %cond.load352 +; NO_SVE-NEXT: add x10, x0, #118 +; NO_SVE-NEXT: ld1 { v7.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB37_235 +; NO_SVE-NEXT: .LBB37_250: // %cond.load355 +; NO_SVE-NEXT: add x10, x0, #119 +; NO_SVE-NEXT: ld1 { v7.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #56, .LBB37_236 +; NO_SVE-NEXT: .LBB37_251: // %cond.load358 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #57, .LBB37_237 +; NO_SVE-NEXT: .LBB37_252: // %cond.load361 +; NO_SVE-NEXT: add x10, x0, #121 +; NO_SVE-NEXT: ld1 { v7.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #58, .LBB37_238 +; NO_SVE-NEXT: .LBB37_253: // %cond.load364 +; NO_SVE-NEXT: add x10, x0, #122 +; NO_SVE-NEXT: ld1 { v7.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB37_239 +; NO_SVE-NEXT: .LBB37_254: // %cond.load367 +; NO_SVE-NEXT: add x10, x0, #123 +; NO_SVE-NEXT: ld1 { v7.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB37_240 +; NO_SVE-NEXT: .LBB37_255: // %cond.load370 +; NO_SVE-NEXT: add x10, x0, #124 +; NO_SVE-NEXT: ld1 { v7.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB37_241 +; NO_SVE-NEXT: .LBB37_256: // %cond.load373 +; NO_SVE-NEXT: add x10, x0, #125 +; NO_SVE-NEXT: ld1 { v7.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB37_242 +; NO_SVE-NEXT: .LBB37_257: // %cond.load376 +; NO_SVE-NEXT: add x10, x0, #126 +; NO_SVE-NEXT: ld1 { v7.b }[14], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB37_243 +; NO_SVE-NEXT: b .LBB37_244 +; ; VBITS_GE_2048-LABEL: masked_load_sext_v128i8i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -663,6 +8143,625 @@ } define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v64i8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q0, [x1, #32] +; NO_SVE-NEXT: cmeq v2.16b, v2.16b, #0 +; NO_SVE-NEXT: cmeq v1.16b, v0.16b, #0 +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: umov w9, v1.b[1] +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: umov w15, v1.b[6] +; NO_SVE-NEXT: umov w16, v1.b[7] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[8] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w18, v1.b[9] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w9, v1.b[10] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[11] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[12] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[13] +; NO_SVE-NEXT: orr w10, w10, w16, lsl 
#7 +; NO_SVE-NEXT: and w18, w18, #0x1 +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[1] +; NO_SVE-NEXT: orr w10, w10, w18, lsl #9 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #10 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: umov w16, v2.b[0] +; NO_SVE-NEXT: umov w17, v2.b[2] +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w18, v2.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #13 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: bfi w13, w10, #1, #1 +; NO_SVE-NEXT: and w10, w18, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[6] +; NO_SVE-NEXT: bfi w13, w14, #2, #1 +; NO_SVE-NEXT: umov w14, v2.b[7] +; NO_SVE-NEXT: bfi w13, w10, #3, #1 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: bfi w13, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v2.b[8] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[9] +; NO_SVE-NEXT: bfi w13, w10, #5, #1 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[10] +; NO_SVE-NEXT: orr w13, w13, w15, lsl #6 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w13, w10, lsl #7 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[11] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v2.b[12] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v1.b[15] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: cmeq v1.16b, v3.16b, #0 +; NO_SVE-NEXT: umov w17, v2.b[14] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #10 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[13] +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #11 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #15 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[4] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #1, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #13 +; NO_SVE-NEXT: bfi w15, w13, #2, #1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #3, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[9] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: bfi w15, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v1.b[8] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #14 +; NO_SVE-NEXT: bfi w15, w11, #5, #1 +; NO_SVE-NEXT: umov w18, v1.b[13] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w15, w14, lsl #6 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[10] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #8 +; NO_SVE-NEXT: umov w15, v2.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[11] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[12] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; 
NO_SVE-NEXT: orr w11, w11, w13, lsl #10 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #15 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[1] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[2] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #11 +; NO_SVE-NEXT: umov w12, v0.b[0] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[4] +; NO_SVE-NEXT: umov w17, v0.b[5] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #16, #16 +; NO_SVE-NEXT: bfi w12, w13, #1, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[6] +; NO_SVE-NEXT: bfi w12, w15, #2, #1 +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: bfi w12, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: bfi w12, w14, #4, #1 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: bfi w12, w15, #5, #1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[8] +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[9] +; NO_SVE-NEXT: orr w12, w12, w15, lsl #6 +; NO_SVE-NEXT: umov w15, v0.b[10] +; NO_SVE-NEXT: orr w11, w11, w17, lsl #13 +; NO_SVE-NEXT: orr w12, w12, w13, lsl #7 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[11] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w12, w12, w13, lsl #8 +; NO_SVE-NEXT: umov w13, v0.b[12] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #14 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[13] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[14] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #10 +; NO_SVE-NEXT: umov w14, v1.b[15] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w12, w12, w16, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[15] +; NO_SVE-NEXT: orr w12, w12, w13, lsl #12 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #15 +; NO_SVE-NEXT: orr w12, w12, w15, lsl #13 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #15 +; NO_SVE-NEXT: bfi w9, w11, #16, #16 +; NO_SVE-NEXT: bfi x9, x10, #32, #32 +; NO_SVE-NEXT: tbz w9, #0, .LBB38_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB38_3 +; NO_SVE-NEXT: b .LBB38_4 +; NO_SVE-NEXT: .LBB38_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB38_4 +; NO_SVE-NEXT: .LBB38_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x10] +; NO_SVE-NEXT: .LBB38_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB38_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB38_21 +; NO_SVE-NEXT: .LBB38_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB38_22 +; NO_SVE-NEXT: .LBB38_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB38_23 +; NO_SVE-NEXT: .LBB38_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB38_24 +; NO_SVE-NEXT: .LBB38_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB38_25 +; NO_SVE-NEXT: .LBB38_10: // %else20 +; NO_SVE-NEXT: tbnz w9, #8, .LBB38_26 +; NO_SVE-NEXT: .LBB38_11: // %else23 +; NO_SVE-NEXT: tbnz w9, #9, .LBB38_27 +; NO_SVE-NEXT: .LBB38_12: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB38_28 +; NO_SVE-NEXT: .LBB38_13: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB38_29 +; NO_SVE-NEXT: .LBB38_14: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB38_30 +; NO_SVE-NEXT: 
.LBB38_15: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB38_31 +; NO_SVE-NEXT: .LBB38_16: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB38_32 +; NO_SVE-NEXT: .LBB38_17: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB38_33 +; NO_SVE-NEXT: .LBB38_18: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB38_34 +; NO_SVE-NEXT: .LBB38_19: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB38_35 +; NO_SVE-NEXT: b .LBB38_36 +; NO_SVE-NEXT: .LBB38_20: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB38_6 +; NO_SVE-NEXT: .LBB38_21: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB38_7 +; NO_SVE-NEXT: .LBB38_22: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB38_8 +; NO_SVE-NEXT: .LBB38_23: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB38_9 +; NO_SVE-NEXT: .LBB38_24: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB38_10 +; NO_SVE-NEXT: .LBB38_25: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #8, .LBB38_11 +; NO_SVE-NEXT: .LBB38_26: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #9, .LBB38_12 +; NO_SVE-NEXT: .LBB38_27: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #10, .LBB38_13 +; NO_SVE-NEXT: .LBB38_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB38_14 +; NO_SVE-NEXT: .LBB38_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB38_15 +; NO_SVE-NEXT: .LBB38_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB38_16 +; NO_SVE-NEXT: .LBB38_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB38_17 +; NO_SVE-NEXT: .LBB38_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB38_18 +; NO_SVE-NEXT: .LBB38_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB38_19 +; NO_SVE-NEXT: .LBB38_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #17, .LBB38_36 +; NO_SVE-NEXT: .LBB38_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x10] +; NO_SVE-NEXT: .LBB38_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB38_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB38_53 +; NO_SVE-NEXT: .LBB38_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB38_54 +; NO_SVE-NEXT: .LBB38_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB38_55 +; NO_SVE-NEXT: .LBB38_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB38_56 +; NO_SVE-NEXT: .LBB38_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB38_57 +; NO_SVE-NEXT: .LBB38_42: // %else68 +; NO_SVE-NEXT: tbnz w9, #24, .LBB38_58 +; NO_SVE-NEXT: .LBB38_43: // %else71 +; NO_SVE-NEXT: tbnz w9, #25, .LBB38_59 +; NO_SVE-NEXT: .LBB38_44: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB38_60 +; 
NO_SVE-NEXT: .LBB38_45: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB38_61 +; NO_SVE-NEXT: .LBB38_46: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB38_62 +; NO_SVE-NEXT: .LBB38_47: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB38_63 +; NO_SVE-NEXT: .LBB38_48: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB38_64 +; NO_SVE-NEXT: .LBB38_49: // %else89 +; NO_SVE-NEXT: tbnz w9, #31, .LBB38_65 +; NO_SVE-NEXT: .LBB38_50: // %else92 +; NO_SVE-NEXT: tbz x9, #32, .LBB38_66 +; NO_SVE-NEXT: .LBB38_51: // %cond.load94 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #33, .LBB38_67 +; NO_SVE-NEXT: b .LBB38_68 +; NO_SVE-NEXT: .LBB38_52: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB38_38 +; NO_SVE-NEXT: .LBB38_53: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB38_39 +; NO_SVE-NEXT: .LBB38_54: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB38_40 +; NO_SVE-NEXT: .LBB38_55: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB38_41 +; NO_SVE-NEXT: .LBB38_56: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB38_42 +; NO_SVE-NEXT: .LBB38_57: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB38_43 +; NO_SVE-NEXT: .LBB38_58: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB38_44 +; NO_SVE-NEXT: .LBB38_59: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB38_45 +; NO_SVE-NEXT: .LBB38_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB38_46 +; NO_SVE-NEXT: .LBB38_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB38_47 +; NO_SVE-NEXT: .LBB38_62: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB38_48 +; NO_SVE-NEXT: .LBB38_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB38_49 +; NO_SVE-NEXT: .LBB38_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB38_50 +; NO_SVE-NEXT: .LBB38_65: // %cond.load91 +; NO_SVE-NEXT: add x10, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #32, .LBB38_51 +; NO_SVE-NEXT: .LBB38_66: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz x9, #33, .LBB38_68 +; NO_SVE-NEXT: .LBB38_67: // %cond.load97 +; NO_SVE-NEXT: add x10, x0, #33 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x10] +; NO_SVE-NEXT: .LBB38_68: // %else98 +; NO_SVE-NEXT: tbnz x9, #34, .LBB38_84 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x9, #35, .LBB38_85 +; NO_SVE-NEXT: .LBB38_70: // %else104 +; NO_SVE-NEXT: tbnz x9, #36, .LBB38_86 +; NO_SVE-NEXT: .LBB38_71: // %else107 +; NO_SVE-NEXT: tbnz x9, #37, .LBB38_87 +; NO_SVE-NEXT: .LBB38_72: // %else110 +; NO_SVE-NEXT: tbnz x9, #38, .LBB38_88 +; NO_SVE-NEXT: .LBB38_73: // %else113 +; NO_SVE-NEXT: tbnz x9, #39, .LBB38_89 +; NO_SVE-NEXT: .LBB38_74: // %else116 +; 
NO_SVE-NEXT: tbnz x9, #40, .LBB38_90 +; NO_SVE-NEXT: .LBB38_75: // %else119 +; NO_SVE-NEXT: tbnz x9, #41, .LBB38_91 +; NO_SVE-NEXT: .LBB38_76: // %else122 +; NO_SVE-NEXT: tbnz x9, #42, .LBB38_92 +; NO_SVE-NEXT: .LBB38_77: // %else125 +; NO_SVE-NEXT: tbnz x9, #43, .LBB38_93 +; NO_SVE-NEXT: .LBB38_78: // %else128 +; NO_SVE-NEXT: tbnz x9, #44, .LBB38_94 +; NO_SVE-NEXT: .LBB38_79: // %else131 +; NO_SVE-NEXT: tbnz x9, #45, .LBB38_95 +; NO_SVE-NEXT: .LBB38_80: // %else134 +; NO_SVE-NEXT: tbnz x9, #46, .LBB38_96 +; NO_SVE-NEXT: .LBB38_81: // %else137 +; NO_SVE-NEXT: tbnz x9, #47, .LBB38_97 +; NO_SVE-NEXT: .LBB38_82: // %else140 +; NO_SVE-NEXT: tbz x9, #48, .LBB38_98 +; NO_SVE-NEXT: .LBB38_83: // %cond.load142 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #49, .LBB38_99 +; NO_SVE-NEXT: b .LBB38_100 +; NO_SVE-NEXT: .LBB38_84: // %cond.load100 +; NO_SVE-NEXT: add x10, x0, #34 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB38_70 +; NO_SVE-NEXT: .LBB38_85: // %cond.load103 +; NO_SVE-NEXT: add x10, x0, #35 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB38_71 +; NO_SVE-NEXT: .LBB38_86: // %cond.load106 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB38_72 +; NO_SVE-NEXT: .LBB38_87: // %cond.load109 +; NO_SVE-NEXT: add x10, x0, #37 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB38_73 +; NO_SVE-NEXT: .LBB38_88: // %cond.load112 +; NO_SVE-NEXT: add x10, x0, #38 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB38_74 +; NO_SVE-NEXT: .LBB38_89: // %cond.load115 +; NO_SVE-NEXT: add x10, x0, #39 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #40, .LBB38_75 +; NO_SVE-NEXT: .LBB38_90: // %cond.load118 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #41, .LBB38_76 +; NO_SVE-NEXT: .LBB38_91: // %cond.load121 +; NO_SVE-NEXT: add x10, x0, #41 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #42, .LBB38_77 +; NO_SVE-NEXT: .LBB38_92: // %cond.load124 +; NO_SVE-NEXT: add x10, x0, #42 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #43, .LBB38_78 +; NO_SVE-NEXT: .LBB38_93: // %cond.load127 +; NO_SVE-NEXT: add x10, x0, #43 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #44, .LBB38_79 +; NO_SVE-NEXT: .LBB38_94: // %cond.load130 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #45, .LBB38_80 +; NO_SVE-NEXT: .LBB38_95: // %cond.load133 +; NO_SVE-NEXT: add x10, x0, #45 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB38_81 +; NO_SVE-NEXT: .LBB38_96: // %cond.load136 +; NO_SVE-NEXT: add x10, x0, #46 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB38_82 +; NO_SVE-NEXT: .LBB38_97: // %cond.load139 +; NO_SVE-NEXT: add x10, x0, #47 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #48, .LBB38_83 +; NO_SVE-NEXT: .LBB38_98: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz x9, #49, .LBB38_100 +; NO_SVE-NEXT: .LBB38_99: // %cond.load145 +; NO_SVE-NEXT: add x10, x0, #49 +; NO_SVE-NEXT: ld1 { v3.b }[1], [x10] +; NO_SVE-NEXT: .LBB38_100: // %else146 +; NO_SVE-NEXT: tbnz x9, #50, .LBB38_116 +; NO_SVE-NEXT: // %bb.101: // %else149 +; NO_SVE-NEXT: tbnz x9, #51, .LBB38_117 +; NO_SVE-NEXT: .LBB38_102: // %else152 +; NO_SVE-NEXT: tbnz x9, #52, .LBB38_118 +; NO_SVE-NEXT: .LBB38_103: // %else155 +; NO_SVE-NEXT: tbnz 
x9, #53, .LBB38_119 +; NO_SVE-NEXT: .LBB38_104: // %else158 +; NO_SVE-NEXT: tbnz x9, #54, .LBB38_120 +; NO_SVE-NEXT: .LBB38_105: // %else161 +; NO_SVE-NEXT: tbnz x9, #55, .LBB38_121 +; NO_SVE-NEXT: .LBB38_106: // %else164 +; NO_SVE-NEXT: tbnz x9, #56, .LBB38_122 +; NO_SVE-NEXT: .LBB38_107: // %else167 +; NO_SVE-NEXT: tbnz x9, #57, .LBB38_123 +; NO_SVE-NEXT: .LBB38_108: // %else170 +; NO_SVE-NEXT: tbnz x9, #58, .LBB38_124 +; NO_SVE-NEXT: .LBB38_109: // %else173 +; NO_SVE-NEXT: tbnz x9, #59, .LBB38_125 +; NO_SVE-NEXT: .LBB38_110: // %else176 +; NO_SVE-NEXT: tbnz x9, #60, .LBB38_126 +; NO_SVE-NEXT: .LBB38_111: // %else179 +; NO_SVE-NEXT: tbnz x9, #61, .LBB38_127 +; NO_SVE-NEXT: .LBB38_112: // %else182 +; NO_SVE-NEXT: tbnz x9, #62, .LBB38_128 +; NO_SVE-NEXT: .LBB38_113: // %else185 +; NO_SVE-NEXT: tbz x9, #63, .LBB38_115 +; NO_SVE-NEXT: .LBB38_114: // %cond.load187 +; NO_SVE-NEXT: add x9, x0, #63 +; NO_SVE-NEXT: ld1 { v3.b }[15], [x9] +; NO_SVE-NEXT: .LBB38_115: // %else188 +; NO_SVE-NEXT: sshll v6.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v0.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll2 v5.8h, v2.16b, #0 +; NO_SVE-NEXT: sshll2 v16.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v7.8h, v1.16b, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: sshll2 v0.4s, v6.8h, #0 +; NO_SVE-NEXT: sshll v6.4s, v6.4h, #0 +; NO_SVE-NEXT: sshll v1.8h, v1.8b, #0 +; NO_SVE-NEXT: stp q6, q0, [x8] +; NO_SVE-NEXT: sshll2 v0.4s, v5.8h, #0 +; NO_SVE-NEXT: sshll v5.4s, v5.4h, #0 +; NO_SVE-NEXT: sshll2 v4.8h, v3.16b, #0 +; NO_SVE-NEXT: sshll2 v6.4s, v1.8h, #0 +; NO_SVE-NEXT: stp q5, q0, [x8, #160] +; NO_SVE-NEXT: sshll v0.4s, v1.4h, #0 +; NO_SVE-NEXT: sshll2 v1.4s, v4.8h, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #64] +; NO_SVE-NEXT: sshll v0.4s, v4.4h, #0 +; NO_SVE-NEXT: sshll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #224] +; NO_SVE-NEXT: sshll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v2.4h, #0 +; NO_SVE-NEXT: sshll v2.8h, v3.8b, #0 +; NO_SVE-NEXT: sshll2 v17.4s, v7.8h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: sshll v7.4s, v7.4h, #0 +; NO_SVE-NEXT: sshll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #96] +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB38_116: // %cond.load148 +; NO_SVE-NEXT: add x10, x0, #50 +; NO_SVE-NEXT: ld1 { v3.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #51, .LBB38_102 +; NO_SVE-NEXT: .LBB38_117: // %cond.load151 +; NO_SVE-NEXT: add x10, x0, #51 +; NO_SVE-NEXT: ld1 { v3.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB38_103 +; NO_SVE-NEXT: .LBB38_118: // %cond.load154 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB38_104 +; NO_SVE-NEXT: .LBB38_119: // %cond.load157 +; NO_SVE-NEXT: add x10, x0, #53 +; NO_SVE-NEXT: ld1 { v3.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB38_105 +; NO_SVE-NEXT: .LBB38_120: // %cond.load160 +; NO_SVE-NEXT: add x10, x0, #54 +; NO_SVE-NEXT: ld1 { v3.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB38_106 +; NO_SVE-NEXT: .LBB38_121: // %cond.load163 +; NO_SVE-NEXT: add x10, x0, #55 +; NO_SVE-NEXT: ld1 { v3.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #56, .LBB38_107 +; NO_SVE-NEXT: .LBB38_122: // %cond.load166 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #57, .LBB38_108 +; NO_SVE-NEXT: .LBB38_123: // %cond.load169 +; NO_SVE-NEXT: add x10, x0, #57 +; NO_SVE-NEXT: ld1 { v3.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #58, .LBB38_109 +; 
NO_SVE-NEXT: .LBB38_124: // %cond.load172 +; NO_SVE-NEXT: add x10, x0, #58 +; NO_SVE-NEXT: ld1 { v3.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB38_110 +; NO_SVE-NEXT: .LBB38_125: // %cond.load175 +; NO_SVE-NEXT: add x10, x0, #59 +; NO_SVE-NEXT: ld1 { v3.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB38_111 +; NO_SVE-NEXT: .LBB38_126: // %cond.load178 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB38_112 +; NO_SVE-NEXT: .LBB38_127: // %cond.load181 +; NO_SVE-NEXT: add x10, x0, #61 +; NO_SVE-NEXT: ld1 { v3.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB38_113 +; NO_SVE-NEXT: .LBB38_128: // %cond.load184 +; NO_SVE-NEXT: add x10, x0, #62 +; NO_SVE-NEXT: ld1 { v3.b }[14], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB38_114 +; NO_SVE-NEXT: b .LBB38_115 +; ; VBITS_GE_2048-LABEL: masked_load_sext_v64i8i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -679,6 +8778,338 @@ } define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v32i8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.16b, v1.16b, #0 +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w9, v0.b[1] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: umov w10, v0.b[0] +; NO_SVE-NEXT: umov w12, v0.b[3] +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: umov w14, v0.b[5] +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: umov w9, v0.b[8] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w11, v0.b[9] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v0.b[10] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[11] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #9 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[12] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w16, w15, #1, #1 +; NO_SVE-NEXT: umov w15, v1.b[9] +; NO_SVE-NEXT: bfi w16, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[6] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w16, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: bfi w16, w10, #4, #1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v0.b[13] +; NO_SVE-NEXT: bfi w16, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[8] +; NO_SVE-NEXT: umov w10, v0.b[14] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr 
w12, w16, w12, lsl #6 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w11, w12, w11, lsl #7 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[10] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[11] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #14 +; NO_SVE-NEXT: orr w10, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[12] +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[13] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[14] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v0.b[15] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[15] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w9, w12, lsl #15 +; NO_SVE-NEXT: orr w9, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: bfi w9, w12, #16, #16 +; NO_SVE-NEXT: tbz w9, #0, .LBB39_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB39_3 +; NO_SVE-NEXT: b .LBB39_4 +; NO_SVE-NEXT: .LBB39_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB39_4 +; NO_SVE-NEXT: .LBB39_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x10] +; NO_SVE-NEXT: .LBB39_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB39_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB39_21 +; NO_SVE-NEXT: .LBB39_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB39_22 +; NO_SVE-NEXT: .LBB39_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB39_23 +; NO_SVE-NEXT: .LBB39_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB39_24 +; NO_SVE-NEXT: .LBB39_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB39_25 +; NO_SVE-NEXT: .LBB39_10: // %else20 +; NO_SVE-NEXT: tbnz w9, #8, .LBB39_26 +; NO_SVE-NEXT: .LBB39_11: // %else23 +; NO_SVE-NEXT: tbnz w9, #9, .LBB39_27 +; NO_SVE-NEXT: .LBB39_12: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB39_28 +; NO_SVE-NEXT: .LBB39_13: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB39_29 +; NO_SVE-NEXT: .LBB39_14: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB39_30 +; NO_SVE-NEXT: .LBB39_15: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB39_31 +; NO_SVE-NEXT: .LBB39_16: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB39_32 +; NO_SVE-NEXT: .LBB39_17: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB39_33 +; NO_SVE-NEXT: .LBB39_18: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB39_34 +; NO_SVE-NEXT: .LBB39_19: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB39_35 +; NO_SVE-NEXT: b .LBB39_36 +; NO_SVE-NEXT: .LBB39_20: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB39_6 +; NO_SVE-NEXT: .LBB39_21: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB39_7 +; NO_SVE-NEXT: .LBB39_22: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB39_8 +; NO_SVE-NEXT: .LBB39_23: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB39_9 +; NO_SVE-NEXT: .LBB39_24: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], 
[x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB39_10 +; NO_SVE-NEXT: .LBB39_25: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #8, .LBB39_11 +; NO_SVE-NEXT: .LBB39_26: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #9, .LBB39_12 +; NO_SVE-NEXT: .LBB39_27: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #10, .LBB39_13 +; NO_SVE-NEXT: .LBB39_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB39_14 +; NO_SVE-NEXT: .LBB39_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB39_15 +; NO_SVE-NEXT: .LBB39_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB39_16 +; NO_SVE-NEXT: .LBB39_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB39_17 +; NO_SVE-NEXT: .LBB39_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB39_18 +; NO_SVE-NEXT: .LBB39_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB39_19 +; NO_SVE-NEXT: .LBB39_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #17, .LBB39_36 +; NO_SVE-NEXT: .LBB39_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x10] +; NO_SVE-NEXT: .LBB39_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB39_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB39_53 +; NO_SVE-NEXT: .LBB39_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB39_54 +; NO_SVE-NEXT: .LBB39_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB39_55 +; NO_SVE-NEXT: .LBB39_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB39_56 +; NO_SVE-NEXT: .LBB39_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB39_57 +; NO_SVE-NEXT: .LBB39_42: // %else68 +; NO_SVE-NEXT: tbnz w9, #24, .LBB39_58 +; NO_SVE-NEXT: .LBB39_43: // %else71 +; NO_SVE-NEXT: tbnz w9, #25, .LBB39_59 +; NO_SVE-NEXT: .LBB39_44: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB39_60 +; NO_SVE-NEXT: .LBB39_45: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB39_61 +; NO_SVE-NEXT: .LBB39_46: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB39_62 +; NO_SVE-NEXT: .LBB39_47: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB39_63 +; NO_SVE-NEXT: .LBB39_48: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB39_64 +; NO_SVE-NEXT: .LBB39_49: // %else89 +; NO_SVE-NEXT: tbz w9, #31, .LBB39_51 +; NO_SVE-NEXT: .LBB39_50: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x9] +; NO_SVE-NEXT: .LBB39_51: // %else92 +; NO_SVE-NEXT: sshll v3.8h, v0.8b, #0 +; NO_SVE-NEXT: sshll2 v0.8h, v0.16b, #0 +; NO_SVE-NEXT: sshll v2.8h, v1.8b, #0 +; NO_SVE-NEXT: sshll v6.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll2 v1.8h, v1.16b, #0 +; NO_SVE-NEXT: sshll2 v16.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll2 v5.4s, v3.8h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #96] +; NO_SVE-NEXT: sshll2 v0.2d, v6.4s, #0 +; NO_SVE-NEXT: sshll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: sshll2 v7.4s, v1.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: stp q6, q0, [x8, #64] +; NO_SVE-NEXT: sshll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: sshll 
v5.2d, v5.2s, #0 +; NO_SVE-NEXT: sshll2 v4.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll2 v6.2d, v1.4s, #0 +; NO_SVE-NEXT: stp q5, q0, [x8, #32] +; NO_SVE-NEXT: sshll v0.2d, v1.2s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #192] +; NO_SVE-NEXT: sshll v0.2d, v4.2s, #0 +; NO_SVE-NEXT: sshll v3.4s, v3.4h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #160] +; NO_SVE-NEXT: sshll2 v0.2d, v3.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v3.2s, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: sshll2 v17.2d, v7.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8] +; NO_SVE-NEXT: sshll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: sshll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #224] +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB39_52: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB39_38 +; NO_SVE-NEXT: .LBB39_53: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB39_39 +; NO_SVE-NEXT: .LBB39_54: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB39_40 +; NO_SVE-NEXT: .LBB39_55: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB39_41 +; NO_SVE-NEXT: .LBB39_56: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB39_42 +; NO_SVE-NEXT: .LBB39_57: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB39_43 +; NO_SVE-NEXT: .LBB39_58: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB39_44 +; NO_SVE-NEXT: .LBB39_59: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB39_45 +; NO_SVE-NEXT: .LBB39_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB39_46 +; NO_SVE-NEXT: .LBB39_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB39_47 +; NO_SVE-NEXT: .LBB39_62: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB39_48 +; NO_SVE-NEXT: .LBB39_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB39_49 +; NO_SVE-NEXT: .LBB39_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x10] +; NO_SVE-NEXT: tbnz w9, #31, .LBB39_50 +; NO_SVE-NEXT: b .LBB39_51 +; ; VBITS_GE_2048-LABEL: masked_load_sext_v32i8i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -695,6 +9126,635 @@ } define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v64i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q2, [x1, #96] +; NO_SVE-NEXT: cmeq v3.8h, v3.8h, #0 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: umov w9, v3.b[1] +; NO_SVE-NEXT: umov w11, v3.b[2] +; NO_SVE-NEXT: umov w10, v3.b[0] +; NO_SVE-NEXT: umov w12, v3.b[3] +; NO_SVE-NEXT: umov w13, v3.b[4] +; NO_SVE-NEXT: umov w14, 
v3.b[5] +; NO_SVE-NEXT: xtn v6.8b, v2.8h +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[6] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v3.b[7] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: umov w17, v6.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w9, v6.b[2] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: umov w11, v6.b[3] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: umov w12, v6.b[4] +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: cmeq v2.8h, v5.8h, #0 +; NO_SVE-NEXT: and w18, w18, #0x1 +; NO_SVE-NEXT: umov w13, v6.b[5] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v2.8h +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w18, lsl #9 +; NO_SVE-NEXT: umov w14, v6.b[6] +; NO_SVE-NEXT: umov w15, v5.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w16, v6.b[7] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v5.b[0] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #13 +; NO_SVE-NEXT: umov w13, v5.b[2] +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[3] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #14 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[4] +; NO_SVE-NEXT: bfi w10, w11, #1, #1 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #15 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: cmeq v2.8h, v4.8h, #0 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[6] +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: bfi w10, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v5.b[7] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[0] +; NO_SVE-NEXT: bfi w10, w11, #4, #1 +; NO_SVE-NEXT: umov w16, v2.b[6] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[1] +; NO_SVE-NEXT: ldp q7, q3, [x1, #32] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #6 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w10, w10, w13, lsl #7 +; NO_SVE-NEXT: umov w13, v2.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[3] +; NO_SVE-NEXT: umov w14, v2.b[4] +; NO_SVE-NEXT: cmeq v4.8h, v7.8h, #0 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #9 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #10 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[1] +; NO_SVE-NEXT: umov w15, v4.b[0] +; NO_SVE-NEXT: umov w17, v4.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #11 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: orr w10, w10, w12, lsl #12 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #13 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[3] +; NO_SVE-NEXT: and w12, w15, #0x1 
+; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: umov w15, v4.b[4] +; NO_SVE-NEXT: bfi w12, w11, #1, #1 +; NO_SVE-NEXT: umov w11, v4.b[5] +; NO_SVE-NEXT: cmeq v3.8h, v3.8h, #0 +; NO_SVE-NEXT: umov w17, v4.b[7] +; NO_SVE-NEXT: bfi w12, w13, #2, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v4.b[6] +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w12, w13, #3, #1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #14 +; NO_SVE-NEXT: umov w13, v3.b[0] +; NO_SVE-NEXT: bfi w12, w14, #4, #1 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: bfi w12, w11, #5, #1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[1] +; NO_SVE-NEXT: orr w11, w12, w11, lsl #6 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #7 +; NO_SVE-NEXT: umov w14, v3.b[3] +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v3.b[2] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[4] +; NO_SVE-NEXT: umov w16, v1.b[1] +; NO_SVE-NEXT: umov w17, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #15 +; NO_SVE-NEXT: umov w18, v1.b[4] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v1.b[0] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #11 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[5] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[3] +; NO_SVE-NEXT: umov w1, v1.b[5] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #12 +; NO_SVE-NEXT: bfi w12, w15, #1, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w15, w18, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[6] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: bfi w12, w16, #2, #1 +; NO_SVE-NEXT: and w16, w1, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #13 +; NO_SVE-NEXT: bfi w12, w14, #3, #1 +; NO_SVE-NEXT: umov w14, v1.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w12, w15, #4, #1 +; NO_SVE-NEXT: umov w15, v3.b[6] +; NO_SVE-NEXT: bfi w12, w16, #5, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[1] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #6 +; NO_SVE-NEXT: bfi w10, w9, #16, #16 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[2] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #7 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[3] +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #14 +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #8 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[6] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #10 +; NO_SVE-NEXT: umov w14, v3.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w12, w12, w16, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: orr w12, w12, w13, lsl #12 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #15 +; NO_SVE-NEXT: orr w12, w12, w15, lsl #13 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #15 
+; NO_SVE-NEXT: bfi w9, w11, #16, #16 +; NO_SVE-NEXT: bfi x9, x10, #32, #32 +; NO_SVE-NEXT: tbz w9, #0, .LBB40_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB40_3 +; NO_SVE-NEXT: b .LBB40_4 +; NO_SVE-NEXT: .LBB40_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB40_4 +; NO_SVE-NEXT: .LBB40_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB40_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB40_13 +; NO_SVE-NEXT: .LBB40_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB40_14 +; NO_SVE-NEXT: .LBB40_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB40_15 +; NO_SVE-NEXT: .LBB40_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB40_16 +; NO_SVE-NEXT: .LBB40_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB40_17 +; NO_SVE-NEXT: .LBB40_10: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB40_18 +; NO_SVE-NEXT: .LBB40_11: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #9, .LBB40_19 +; NO_SVE-NEXT: b .LBB40_20 +; NO_SVE-NEXT: .LBB40_12: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB40_6 +; NO_SVE-NEXT: .LBB40_13: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB40_7 +; NO_SVE-NEXT: .LBB40_14: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB40_8 +; NO_SVE-NEXT: .LBB40_15: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB40_9 +; NO_SVE-NEXT: .LBB40_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB40_10 +; NO_SVE-NEXT: .LBB40_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB40_11 +; NO_SVE-NEXT: .LBB40_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #9, .LBB40_20 +; NO_SVE-NEXT: .LBB40_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB40_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB40_29 +; NO_SVE-NEXT: .LBB40_22: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB40_30 +; NO_SVE-NEXT: .LBB40_23: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB40_31 +; NO_SVE-NEXT: .LBB40_24: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB40_32 +; NO_SVE-NEXT: .LBB40_25: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB40_33 +; NO_SVE-NEXT: .LBB40_26: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB40_34 +; NO_SVE-NEXT: .LBB40_27: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB40_35 +; NO_SVE-NEXT: b .LBB40_36 +; NO_SVE-NEXT: .LBB40_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB40_22 +; NO_SVE-NEXT: .LBB40_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB40_23 +; NO_SVE-NEXT: .LBB40_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB40_24 +; NO_SVE-NEXT: .LBB40_31: // %cond.load37 +; NO_SVE-NEXT: add x10, 
x0, #26 +; NO_SVE-NEXT: ld1 { v1.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB40_25 +; NO_SVE-NEXT: .LBB40_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB40_26 +; NO_SVE-NEXT: .LBB40_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB40_27 +; NO_SVE-NEXT: .LBB40_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w9, #17, .LBB40_36 +; NO_SVE-NEXT: .LBB40_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #34 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB40_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB40_45 +; NO_SVE-NEXT: .LBB40_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB40_46 +; NO_SVE-NEXT: .LBB40_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB40_47 +; NO_SVE-NEXT: .LBB40_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB40_48 +; NO_SVE-NEXT: .LBB40_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB40_49 +; NO_SVE-NEXT: .LBB40_42: // %else68 +; NO_SVE-NEXT: tbz w9, #24, .LBB40_50 +; NO_SVE-NEXT: .LBB40_43: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #25, .LBB40_51 +; NO_SVE-NEXT: b .LBB40_52 +; NO_SVE-NEXT: .LBB40_44: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB40_38 +; NO_SVE-NEXT: .LBB40_45: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #38 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB40_39 +; NO_SVE-NEXT: .LBB40_46: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB40_40 +; NO_SVE-NEXT: .LBB40_47: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #42 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB40_41 +; NO_SVE-NEXT: .LBB40_48: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB40_42 +; NO_SVE-NEXT: .LBB40_49: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #46 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #24, .LBB40_43 +; NO_SVE-NEXT: .LBB40_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w9, #25, .LBB40_52 +; NO_SVE-NEXT: .LBB40_51: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #50 +; NO_SVE-NEXT: ld1 { v3.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_52: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB40_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB40_61 +; NO_SVE-NEXT: .LBB40_54: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB40_62 +; NO_SVE-NEXT: .LBB40_55: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB40_63 +; NO_SVE-NEXT: .LBB40_56: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB40_64 +; NO_SVE-NEXT: .LBB40_57: // %else89 +; NO_SVE-NEXT: tbnz w9, #31, .LBB40_65 +; NO_SVE-NEXT: .LBB40_58: // %else92 +; NO_SVE-NEXT: tbz x9, #32, .LBB40_66 +; NO_SVE-NEXT: .LBB40_59: // %cond.load94 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #33, .LBB40_67 +; NO_SVE-NEXT: b .LBB40_68 +; NO_SVE-NEXT: .LBB40_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB40_54 +; NO_SVE-NEXT: .LBB40_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #54 +; NO_SVE-NEXT: ld1 { v3.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB40_55 +; NO_SVE-NEXT: .LBB40_62: // 
%cond.load82 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB40_56 +; NO_SVE-NEXT: .LBB40_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #58 +; NO_SVE-NEXT: ld1 { v3.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB40_57 +; NO_SVE-NEXT: .LBB40_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB40_58 +; NO_SVE-NEXT: .LBB40_65: // %cond.load91 +; NO_SVE-NEXT: add x10, x0, #62 +; NO_SVE-NEXT: ld1 { v3.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #32, .LBB40_59 +; NO_SVE-NEXT: .LBB40_66: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz x9, #33, .LBB40_68 +; NO_SVE-NEXT: .LBB40_67: // %cond.load97 +; NO_SVE-NEXT: add x10, x0, #66 +; NO_SVE-NEXT: ld1 { v4.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_68: // %else98 +; NO_SVE-NEXT: tbnz x9, #34, .LBB40_76 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x9, #35, .LBB40_77 +; NO_SVE-NEXT: .LBB40_70: // %else104 +; NO_SVE-NEXT: tbnz x9, #36, .LBB40_78 +; NO_SVE-NEXT: .LBB40_71: // %else107 +; NO_SVE-NEXT: tbnz x9, #37, .LBB40_79 +; NO_SVE-NEXT: .LBB40_72: // %else110 +; NO_SVE-NEXT: tbnz x9, #38, .LBB40_80 +; NO_SVE-NEXT: .LBB40_73: // %else113 +; NO_SVE-NEXT: tbnz x9, #39, .LBB40_81 +; NO_SVE-NEXT: .LBB40_74: // %else116 +; NO_SVE-NEXT: tbz x9, #40, .LBB40_82 +; NO_SVE-NEXT: .LBB40_75: // %cond.load118 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #41, .LBB40_83 +; NO_SVE-NEXT: b .LBB40_84 +; NO_SVE-NEXT: .LBB40_76: // %cond.load100 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB40_70 +; NO_SVE-NEXT: .LBB40_77: // %cond.load103 +; NO_SVE-NEXT: add x10, x0, #70 +; NO_SVE-NEXT: ld1 { v4.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB40_71 +; NO_SVE-NEXT: .LBB40_78: // %cond.load106 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB40_72 +; NO_SVE-NEXT: .LBB40_79: // %cond.load109 +; NO_SVE-NEXT: add x10, x0, #74 +; NO_SVE-NEXT: ld1 { v4.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB40_73 +; NO_SVE-NEXT: .LBB40_80: // %cond.load112 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.h }[6], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB40_74 +; NO_SVE-NEXT: .LBB40_81: // %cond.load115 +; NO_SVE-NEXT: add x10, x0, #78 +; NO_SVE-NEXT: ld1 { v4.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #40, .LBB40_75 +; NO_SVE-NEXT: .LBB40_82: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz x9, #41, .LBB40_84 +; NO_SVE-NEXT: .LBB40_83: // %cond.load121 +; NO_SVE-NEXT: add x10, x0, #82 +; NO_SVE-NEXT: ld1 { v5.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_84: // %else122 +; NO_SVE-NEXT: tbnz x9, #42, .LBB40_92 +; NO_SVE-NEXT: // %bb.85: // %else125 +; NO_SVE-NEXT: tbnz x9, #43, .LBB40_93 +; NO_SVE-NEXT: .LBB40_86: // %else128 +; NO_SVE-NEXT: tbnz x9, #44, .LBB40_94 +; NO_SVE-NEXT: .LBB40_87: // %else131 +; NO_SVE-NEXT: tbnz x9, #45, .LBB40_95 +; NO_SVE-NEXT: .LBB40_88: // %else134 +; NO_SVE-NEXT: tbnz x9, #46, .LBB40_96 +; NO_SVE-NEXT: .LBB40_89: // %else137 +; NO_SVE-NEXT: tbnz x9, #47, .LBB40_97 +; NO_SVE-NEXT: .LBB40_90: // %else140 +; NO_SVE-NEXT: tbz x9, #48, .LBB40_98 +; NO_SVE-NEXT: .LBB40_91: // %cond.load142 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #49, .LBB40_99 +; NO_SVE-NEXT: b .LBB40_100 +; NO_SVE-NEXT: .LBB40_92: // %cond.load124 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.h }[2], [x10] +; 
NO_SVE-NEXT: tbz x9, #43, .LBB40_86 +; NO_SVE-NEXT: .LBB40_93: // %cond.load127 +; NO_SVE-NEXT: add x10, x0, #86 +; NO_SVE-NEXT: ld1 { v5.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #44, .LBB40_87 +; NO_SVE-NEXT: .LBB40_94: // %cond.load130 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #45, .LBB40_88 +; NO_SVE-NEXT: .LBB40_95: // %cond.load133 +; NO_SVE-NEXT: add x10, x0, #90 +; NO_SVE-NEXT: ld1 { v5.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB40_89 +; NO_SVE-NEXT: .LBB40_96: // %cond.load136 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.h }[6], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB40_90 +; NO_SVE-NEXT: .LBB40_97: // %cond.load139 +; NO_SVE-NEXT: add x10, x0, #94 +; NO_SVE-NEXT: ld1 { v5.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #48, .LBB40_91 +; NO_SVE-NEXT: .LBB40_98: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz x9, #49, .LBB40_100 +; NO_SVE-NEXT: .LBB40_99: // %cond.load145 +; NO_SVE-NEXT: add x10, x0, #98 +; NO_SVE-NEXT: ld1 { v6.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_100: // %else146 +; NO_SVE-NEXT: tbnz x9, #50, .LBB40_108 +; NO_SVE-NEXT: // %bb.101: // %else149 +; NO_SVE-NEXT: tbnz x9, #51, .LBB40_109 +; NO_SVE-NEXT: .LBB40_102: // %else152 +; NO_SVE-NEXT: tbnz x9, #52, .LBB40_110 +; NO_SVE-NEXT: .LBB40_103: // %else155 +; NO_SVE-NEXT: tbnz x9, #53, .LBB40_111 +; NO_SVE-NEXT: .LBB40_104: // %else158 +; NO_SVE-NEXT: tbnz x9, #54, .LBB40_112 +; NO_SVE-NEXT: .LBB40_105: // %else161 +; NO_SVE-NEXT: tbnz x9, #55, .LBB40_113 +; NO_SVE-NEXT: .LBB40_106: // %else164 +; NO_SVE-NEXT: tbz x9, #56, .LBB40_114 +; NO_SVE-NEXT: .LBB40_107: // %cond.load166 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #57, .LBB40_115 +; NO_SVE-NEXT: b .LBB40_116 +; NO_SVE-NEXT: .LBB40_108: // %cond.load148 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #51, .LBB40_102 +; NO_SVE-NEXT: .LBB40_109: // %cond.load151 +; NO_SVE-NEXT: add x10, x0, #102 +; NO_SVE-NEXT: ld1 { v6.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB40_103 +; NO_SVE-NEXT: .LBB40_110: // %cond.load154 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB40_104 +; NO_SVE-NEXT: .LBB40_111: // %cond.load157 +; NO_SVE-NEXT: add x10, x0, #106 +; NO_SVE-NEXT: ld1 { v6.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB40_105 +; NO_SVE-NEXT: .LBB40_112: // %cond.load160 +; NO_SVE-NEXT: add x10, x0, #108 +; NO_SVE-NEXT: ld1 { v6.h }[6], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB40_106 +; NO_SVE-NEXT: .LBB40_113: // %cond.load163 +; NO_SVE-NEXT: add x10, x0, #110 +; NO_SVE-NEXT: ld1 { v6.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #56, .LBB40_107 +; NO_SVE-NEXT: .LBB40_114: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz x9, #57, .LBB40_116 +; NO_SVE-NEXT: .LBB40_115: // %cond.load169 +; NO_SVE-NEXT: add x10, x0, #114 +; NO_SVE-NEXT: ld1 { v7.h }[1], [x10] +; NO_SVE-NEXT: .LBB40_116: // %else170 +; NO_SVE-NEXT: tbnz x9, #58, .LBB40_124 +; NO_SVE-NEXT: // %bb.117: // %else173 +; NO_SVE-NEXT: tbnz x9, #59, .LBB40_125 +; NO_SVE-NEXT: .LBB40_118: // %else176 +; NO_SVE-NEXT: tbnz x9, #60, .LBB40_126 +; NO_SVE-NEXT: .LBB40_119: // %else179 +; NO_SVE-NEXT: tbnz x9, #61, .LBB40_127 +; NO_SVE-NEXT: .LBB40_120: // %else182 +; NO_SVE-NEXT: tbnz x9, #62, .LBB40_128 +; NO_SVE-NEXT: .LBB40_121: // %else185 +; NO_SVE-NEXT: tbz x9, #63, .LBB40_123 +; NO_SVE-NEXT: .LBB40_122: // %cond.load187 +; NO_SVE-NEXT: add x9, x0, #126 +; NO_SVE-NEXT: ld1 { v7.h }[7], 
[x9] +; NO_SVE-NEXT: .LBB40_123: // %else188 +; NO_SVE-NEXT: sshll2 v16.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8] +; NO_SVE-NEXT: sshll2 v16.4s, v1.8h, #0 +; NO_SVE-NEXT: sshll v0.4s, v1.4h, #0 +; NO_SVE-NEXT: sshll v1.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: sshll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #64] +; NO_SVE-NEXT: sshll2 v0.4s, v3.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v3.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #96] +; NO_SVE-NEXT: sshll2 v0.4s, v4.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v4.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: sshll2 v0.4s, v5.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v5.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #160] +; NO_SVE-NEXT: sshll2 v0.4s, v6.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v6.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: sshll2 v0.4s, v7.8h, #0 +; NO_SVE-NEXT: sshll v1.4s, v7.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB40_124: // %cond.load172 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB40_118 +; NO_SVE-NEXT: .LBB40_125: // %cond.load175 +; NO_SVE-NEXT: add x10, x0, #118 +; NO_SVE-NEXT: ld1 { v7.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB40_119 +; NO_SVE-NEXT: .LBB40_126: // %cond.load178 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB40_120 +; NO_SVE-NEXT: .LBB40_127: // %cond.load181 +; NO_SVE-NEXT: add x10, x0, #122 +; NO_SVE-NEXT: ld1 { v7.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB40_121 +; NO_SVE-NEXT: .LBB40_128: // %cond.load184 +; NO_SVE-NEXT: add x10, x0, #124 +; NO_SVE-NEXT: ld1 { v7.h }[6], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB40_122 +; NO_SVE-NEXT: b .LBB40_123 +; ; VBITS_GE_2048-LABEL: masked_load_sext_v64i16i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -711,6 +9771,343 @@ } define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v32i16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: umov w9, v0.b[1] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: umov w10, v0.b[0] +; NO_SVE-NEXT: umov w12, v0.b[3] +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: umov w14, v0.b[5] +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: umov w17, v1.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w9, v1.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: umov w15, v2.b[1] +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: umov w16, 
v2.b[2] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: umov w17, v2.b[0] +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #9 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: and w10, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #11 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v2.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w16, w15, #1, #1 +; NO_SVE-NEXT: bfi w16, w10, #2, #1 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #12 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: cmeq v0.8h, v3.8h, #0 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: bfi w16, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v2.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w16, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v1.b[6] +; NO_SVE-NEXT: bfi w16, w12, #5, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #13 +; NO_SVE-NEXT: orr w12, w16, w12, lsl #6 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w12, w10, lsl #7 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v0.b[4] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[7] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w9, w12, lsl #15 +; NO_SVE-NEXT: orr w9, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: bfi w9, w12, #16, #16 +; NO_SVE-NEXT: tbz w9, #0, .LBB41_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB41_3 +; NO_SVE-NEXT: b .LBB41_4 +; NO_SVE-NEXT: .LBB41_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB41_4 +; NO_SVE-NEXT: .LBB41_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x10] +; NO_SVE-NEXT: .LBB41_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB41_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB41_13 +; NO_SVE-NEXT: .LBB41_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB41_14 +; NO_SVE-NEXT: .LBB41_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB41_15 +; NO_SVE-NEXT: .LBB41_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB41_16 +; NO_SVE-NEXT: .LBB41_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB41_17 +; NO_SVE-NEXT: .LBB41_10: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB41_18 +; NO_SVE-NEXT: .LBB41_11: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #9, .LBB41_19 +; NO_SVE-NEXT: b .LBB41_20 +; NO_SVE-NEXT: .LBB41_12: // %cond.load4 +; NO_SVE-NEXT: add 
x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB41_6 +; NO_SVE-NEXT: .LBB41_13: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB41_7 +; NO_SVE-NEXT: .LBB41_14: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB41_8 +; NO_SVE-NEXT: .LBB41_15: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB41_9 +; NO_SVE-NEXT: .LBB41_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB41_10 +; NO_SVE-NEXT: .LBB41_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB41_11 +; NO_SVE-NEXT: .LBB41_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #9, .LBB41_20 +; NO_SVE-NEXT: .LBB41_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.h }[1], [x10] +; NO_SVE-NEXT: .LBB41_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB41_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB41_29 +; NO_SVE-NEXT: .LBB41_22: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB41_30 +; NO_SVE-NEXT: .LBB41_23: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB41_31 +; NO_SVE-NEXT: .LBB41_24: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB41_32 +; NO_SVE-NEXT: .LBB41_25: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB41_33 +; NO_SVE-NEXT: .LBB41_26: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB41_34 +; NO_SVE-NEXT: .LBB41_27: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB41_35 +; NO_SVE-NEXT: b .LBB41_36 +; NO_SVE-NEXT: .LBB41_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB41_22 +; NO_SVE-NEXT: .LBB41_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB41_23 +; NO_SVE-NEXT: .LBB41_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB41_24 +; NO_SVE-NEXT: .LBB41_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB41_25 +; NO_SVE-NEXT: .LBB41_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB41_26 +; NO_SVE-NEXT: .LBB41_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB41_27 +; NO_SVE-NEXT: .LBB41_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w9, #17, .LBB41_36 +; NO_SVE-NEXT: .LBB41_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #34 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x10] +; NO_SVE-NEXT: .LBB41_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB41_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB41_45 +; NO_SVE-NEXT: .LBB41_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB41_46 +; NO_SVE-NEXT: .LBB41_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB41_47 +; NO_SVE-NEXT: .LBB41_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB41_48 +; NO_SVE-NEXT: .LBB41_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB41_49 +; NO_SVE-NEXT: .LBB41_42: // %else68 +; NO_SVE-NEXT: tbz w9, #24, .LBB41_50 +; NO_SVE-NEXT: .LBB41_43: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, 
#48 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #25, .LBB41_51 +; NO_SVE-NEXT: b .LBB41_52 +; NO_SVE-NEXT: .LBB41_44: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB41_38 +; NO_SVE-NEXT: .LBB41_45: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #38 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB41_39 +; NO_SVE-NEXT: .LBB41_46: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB41_40 +; NO_SVE-NEXT: .LBB41_47: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #42 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB41_41 +; NO_SVE-NEXT: .LBB41_48: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB41_42 +; NO_SVE-NEXT: .LBB41_49: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #46 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #24, .LBB41_43 +; NO_SVE-NEXT: .LBB41_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w9, #25, .LBB41_52 +; NO_SVE-NEXT: .LBB41_51: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #50 +; NO_SVE-NEXT: ld1 { v3.h }[1], [x10] +; NO_SVE-NEXT: .LBB41_52: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB41_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB41_61 +; NO_SVE-NEXT: .LBB41_54: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB41_62 +; NO_SVE-NEXT: .LBB41_55: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB41_63 +; NO_SVE-NEXT: .LBB41_56: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB41_64 +; NO_SVE-NEXT: .LBB41_57: // %else89 +; NO_SVE-NEXT: tbz w9, #31, .LBB41_59 +; NO_SVE-NEXT: .LBB41_58: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #62 +; NO_SVE-NEXT: ld1 { v3.h }[7], [x9] +; NO_SVE-NEXT: .LBB41_59: // %else92 +; NO_SVE-NEXT: sshll v6.4s, v0.4h, #0 +; NO_SVE-NEXT: sshll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: sshll2 v5.4s, v2.8h, #0 +; NO_SVE-NEXT: sshll2 v16.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll2 v7.4s, v1.8h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: sshll2 v0.2d, v6.4s, #0 +; NO_SVE-NEXT: sshll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: sshll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: stp q6, q0, [x8] +; NO_SVE-NEXT: sshll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: sshll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: sshll2 v4.4s, v3.8h, #0 +; NO_SVE-NEXT: sshll2 v6.2d, v1.4s, #0 +; NO_SVE-NEXT: stp q5, q0, [x8, #160] +; NO_SVE-NEXT: sshll v0.2d, v1.2s, #0 +; NO_SVE-NEXT: sshll2 v1.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #64] +; NO_SVE-NEXT: sshll v0.2d, v4.2s, #0 +; NO_SVE-NEXT: sshll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #224] +; NO_SVE-NEXT: sshll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: sshll v2.4s, v3.4h, #0 +; NO_SVE-NEXT: sshll2 v17.2d, v7.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: sshll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: sshll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #96] +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB41_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB41_54 +; NO_SVE-NEXT: .LBB41_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #54 +; NO_SVE-NEXT: ld1 { v3.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB41_55 +; NO_SVE-NEXT: .LBB41_62: // %cond.load82 +; NO_SVE-NEXT: add x10, 
x0, #56 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB41_56 +; NO_SVE-NEXT: .LBB41_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #58 +; NO_SVE-NEXT: ld1 { v3.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB41_57 +; NO_SVE-NEXT: .LBB41_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x10] +; NO_SVE-NEXT: tbnz w9, #31, .LBB41_58 +; NO_SVE-NEXT: b .LBB41_59 +; ; VBITS_GE_2048-LABEL: masked_load_sext_v32i16i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -727,6 +10124,349 @@ } define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_v32i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x1, #64] +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: cmeq v3.4s, v3.4s, #0 +; NO_SVE-NEXT: ldp q4, q5, [x1, #96] +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: cmeq v4.4s, v4.4s, #0 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: cmeq v5.4s, v5.4s, #0 +; NO_SVE-NEXT: umov w9, v2.b[1] +; NO_SVE-NEXT: ldp q0, q1, [x1] +; NO_SVE-NEXT: umov w11, v2.b[2] +; NO_SVE-NEXT: umov w10, v2.b[0] +; NO_SVE-NEXT: uzp1 v3.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w12, v2.b[3] +; NO_SVE-NEXT: umov w13, v2.b[4] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[5] +; NO_SVE-NEXT: umov w15, v2.b[6] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w16, v2.b[7] +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: umov w11, v3.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v3.b[2] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[3] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[4] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: ldp q1, q4, [x1, #32] +; NO_SVE-NEXT: orr w9, w10, w9, lsl #8 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[1] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #9 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[5] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #10 +; NO_SVE-NEXT: umov w10, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: cmeq v2.4s, v4.4s, #0 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: bfi w10, w13, #1, #1 +; NO_SVE-NEXT: umov w16, v0.b[5] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: bfi w10, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: orr w9, 
w9, w12, lsl #13 +; NO_SVE-NEXT: xtn v0.8b, v1.8h +; NO_SVE-NEXT: bfi w10, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v3.b[6] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[0] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[1] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #6 +; NO_SVE-NEXT: orr w10, w10, w13, lsl #7 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[3] +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v0.b[4] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v3.b[7] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[7] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w9, w12, lsl #15 +; NO_SVE-NEXT: orr w9, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: bfi w9, w12, #16, #16 +; NO_SVE-NEXT: tbz w9, #0, .LBB42_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB42_3 +; NO_SVE-NEXT: b .LBB42_4 +; NO_SVE-NEXT: .LBB42_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB42_4 +; NO_SVE-NEXT: .LBB42_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB42_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB42_9 +; NO_SVE-NEXT: .LBB42_6: // %else8 +; NO_SVE-NEXT: tbz w9, #4, .LBB42_10 +; NO_SVE-NEXT: .LBB42_7: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #5, .LBB42_11 +; NO_SVE-NEXT: b .LBB42_12 +; NO_SVE-NEXT: .LBB42_8: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB42_6 +; NO_SVE-NEXT: .LBB42_9: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #4, .LBB42_7 +; NO_SVE-NEXT: .LBB42_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #5, .LBB42_12 +; NO_SVE-NEXT: .LBB42_11: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_12: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB42_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB42_17 +; NO_SVE-NEXT: .LBB42_14: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB42_18 +; NO_SVE-NEXT: .LBB42_15: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #9, .LBB42_19 +; NO_SVE-NEXT: b .LBB42_20 +; NO_SVE-NEXT: .LBB42_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB42_14 +; NO_SVE-NEXT: .LBB42_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB42_15 +; NO_SVE-NEXT: .LBB42_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w9, #9, .LBB42_20 +; NO_SVE-NEXT: .LBB42_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, 
#36 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB42_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB42_25 +; NO_SVE-NEXT: .LBB42_22: // %else32 +; NO_SVE-NEXT: tbz w9, #12, .LBB42_26 +; NO_SVE-NEXT: .LBB42_23: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #13, .LBB42_27 +; NO_SVE-NEXT: b .LBB42_28 +; NO_SVE-NEXT: .LBB42_24: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB42_22 +; NO_SVE-NEXT: .LBB42_25: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #12, .LBB42_23 +; NO_SVE-NEXT: .LBB42_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w9, #13, .LBB42_28 +; NO_SVE-NEXT: .LBB42_27: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_28: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB42_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB42_33 +; NO_SVE-NEXT: .LBB42_30: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB42_34 +; NO_SVE-NEXT: .LBB42_31: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB42_35 +; NO_SVE-NEXT: b .LBB42_36 +; NO_SVE-NEXT: .LBB42_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB42_30 +; NO_SVE-NEXT: .LBB42_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB42_31 +; NO_SVE-NEXT: .LBB42_34: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w9, #17, .LBB42_36 +; NO_SVE-NEXT: .LBB42_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB42_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB42_41 +; NO_SVE-NEXT: .LBB42_38: // %else56 +; NO_SVE-NEXT: tbz w9, #20, .LBB42_42 +; NO_SVE-NEXT: .LBB42_39: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #21, .LBB42_43 +; NO_SVE-NEXT: b .LBB42_44 +; NO_SVE-NEXT: .LBB42_40: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB42_38 +; NO_SVE-NEXT: .LBB42_41: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #20, .LBB42_39 +; NO_SVE-NEXT: .LBB42_42: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w9, #21, .LBB42_44 +; NO_SVE-NEXT: .LBB42_43: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_44: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB42_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB42_49 +; NO_SVE-NEXT: .LBB42_46: // %else68 +; NO_SVE-NEXT: tbz w9, #24, .LBB42_50 +; NO_SVE-NEXT: .LBB42_47: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #25, .LBB42_51 +; NO_SVE-NEXT: b .LBB42_52 +; NO_SVE-NEXT: .LBB42_48: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB42_46 +; NO_SVE-NEXT: .LBB42_49: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.s }[3], 
[x10] +; NO_SVE-NEXT: tbnz w9, #24, .LBB42_47 +; NO_SVE-NEXT: .LBB42_50: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w9, #25, .LBB42_52 +; NO_SVE-NEXT: .LBB42_51: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_52: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB42_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB42_57 +; NO_SVE-NEXT: .LBB42_54: // %else80 +; NO_SVE-NEXT: tbz w9, #28, .LBB42_58 +; NO_SVE-NEXT: .LBB42_55: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #29, .LBB42_59 +; NO_SVE-NEXT: b .LBB42_60 +; NO_SVE-NEXT: .LBB42_56: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB42_54 +; NO_SVE-NEXT: .LBB42_57: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #108 +; NO_SVE-NEXT: ld1 { v6.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #28, .LBB42_55 +; NO_SVE-NEXT: .LBB42_58: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w9, #29, .LBB42_60 +; NO_SVE-NEXT: .LBB42_59: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.s }[1], [x10] +; NO_SVE-NEXT: .LBB42_60: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB42_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w9, #31, .LBB42_63 +; NO_SVE-NEXT: .LBB42_62: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #124 +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: .LBB42_63: // %else92 +; NO_SVE-NEXT: sshll2 v16.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: stp q0, q16, [x8] +; NO_SVE-NEXT: sshll2 v16.2d, v1.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v1.2s, #0 +; NO_SVE-NEXT: sshll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: sshll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #64] +; NO_SVE-NEXT: sshll2 v0.2d, v3.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v3.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #96] +; NO_SVE-NEXT: sshll2 v0.2d, v4.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v4.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: sshll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v5.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #160] +; NO_SVE-NEXT: sshll2 v0.2d, v6.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v6.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: sshll2 v0.2d, v7.4s, #0 +; NO_SVE-NEXT: sshll v1.2d, v7.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB42_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x10] +; NO_SVE-NEXT: tbnz w9, #31, .LBB42_62 +; NO_SVE-NEXT: b .LBB42_63 +; ; VBITS_GE_2048-LABEL: masked_load_sext_v32i32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -743,6 +10483,1190 @@ } define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v128i8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #64 +; NO_SVE-NEXT: .cfi_def_cfa_offset 64 +; NO_SVE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; NO_SVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; NO_SVE-NEXT: .cfi_offset w19, -8 +; NO_SVE-NEXT: .cfi_offset w20, -16 +; NO_SVE-NEXT: .cfi_offset w21, -24 +; NO_SVE-NEXT: .cfi_offset w22, -32 +; NO_SVE-NEXT: ldp q2, q0, [x1, #32] +; NO_SVE-NEXT: cmeq v2.16b, v2.16b, #0 +; NO_SVE-NEXT: cmeq v1.16b, v0.16b, #0 +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: umov w9, v1.b[1] +; NO_SVE-NEXT: umov w11, 
v1.b[2] +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: umov w15, v1.b[6] +; NO_SVE-NEXT: umov w16, v1.b[7] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[8] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w18, v1.b[9] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w9, v1.b[10] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[11] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[12] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[13] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w18, w18, #0x1 +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[1] +; NO_SVE-NEXT: orr w10, w10, w18, lsl #9 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #10 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: umov w16, v2.b[0] +; NO_SVE-NEXT: umov w17, v2.b[2] +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w18, v2.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #13 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: bfi w13, w10, #1, #1 +; NO_SVE-NEXT: and w10, w18, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[6] +; NO_SVE-NEXT: bfi w13, w14, #2, #1 +; NO_SVE-NEXT: umov w14, v2.b[7] +; NO_SVE-NEXT: bfi w13, w10, #3, #1 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: bfi w13, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v2.b[8] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[9] +; NO_SVE-NEXT: bfi w13, w10, #5, #1 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[10] +; NO_SVE-NEXT: orr w13, w13, w15, lsl #6 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w13, w10, lsl #7 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[11] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v2.b[12] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v1.b[15] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: cmeq v1.16b, v3.16b, #0 +; NO_SVE-NEXT: umov w17, v2.b[14] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #10 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[13] +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #11 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #15 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[4] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #1, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl 
#13 +; NO_SVE-NEXT: bfi w15, w13, #2, #1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #3, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[9] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: bfi w15, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v1.b[8] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #14 +; NO_SVE-NEXT: bfi w15, w11, #5, #1 +; NO_SVE-NEXT: umov w18, v1.b[13] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w15, w14, lsl #6 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[10] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #8 +; NO_SVE-NEXT: umov w15, v2.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[11] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[12] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: orr w15, w10, w15, lsl #15 +; NO_SVE-NEXT: orr w10, w11, w13, lsl #10 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v0.b[1] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #11 +; NO_SVE-NEXT: umov w11, v0.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #12 +; NO_SVE-NEXT: umov w13, v0.b[3] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[4] +; NO_SVE-NEXT: umov w17, v0.b[5] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w15, w9, #16, #16 +; NO_SVE-NEXT: bfi w11, w12, #1, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[6] +; NO_SVE-NEXT: bfi w11, w14, #2, #1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v0.b[7] +; NO_SVE-NEXT: bfi w11, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v1.b[14] +; NO_SVE-NEXT: bfi w11, w14, #5, #1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[8] +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[9] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #6 +; NO_SVE-NEXT: umov w14, v0.b[10] +; NO_SVE-NEXT: orr w10, w10, w17, lsl #13 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w16, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[11] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v0.b[12] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #14 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[13] +; NO_SVE-NEXT: orr w11, w11, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[14] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #10 +; NO_SVE-NEXT: umov w13, v1.b[15] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w11, w16, lsl #11 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #12 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: orr w13, w10, w13, lsl #15 +; NO_SVE-NEXT: orr w10, w11, w14, lsl #13 +; NO_SVE-NEXT: orr w9, w10, w12, lsl #14 +; NO_SVE-NEXT: orr w10, w9, w16, lsl #15 +; NO_SVE-NEXT: bfi w10, w13, #16, #16 +; NO_SVE-NEXT: bfi x10, x15, #32, #32 +; NO_SVE-NEXT: tbz w10, #0, .LBB43_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w10, #1, .LBB43_3 +; NO_SVE-NEXT: b .LBB43_4 +; NO_SVE-NEXT: .LBB43_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; 
NO_SVE-NEXT: tbz w10, #1, .LBB43_4 +; NO_SVE-NEXT: .LBB43_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x9] +; NO_SVE-NEXT: .LBB43_4: // %else2 +; NO_SVE-NEXT: tbnz w10, #2, .LBB43_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w10, #3, .LBB43_21 +; NO_SVE-NEXT: .LBB43_6: // %else8 +; NO_SVE-NEXT: tbnz w10, #4, .LBB43_22 +; NO_SVE-NEXT: .LBB43_7: // %else11 +; NO_SVE-NEXT: tbnz w10, #5, .LBB43_23 +; NO_SVE-NEXT: .LBB43_8: // %else14 +; NO_SVE-NEXT: tbnz w10, #6, .LBB43_24 +; NO_SVE-NEXT: .LBB43_9: // %else17 +; NO_SVE-NEXT: tbnz w10, #7, .LBB43_25 +; NO_SVE-NEXT: .LBB43_10: // %else20 +; NO_SVE-NEXT: tbnz w10, #8, .LBB43_26 +; NO_SVE-NEXT: .LBB43_11: // %else23 +; NO_SVE-NEXT: tbnz w10, #9, .LBB43_27 +; NO_SVE-NEXT: .LBB43_12: // %else26 +; NO_SVE-NEXT: tbnz w10, #10, .LBB43_28 +; NO_SVE-NEXT: .LBB43_13: // %else29 +; NO_SVE-NEXT: tbnz w10, #11, .LBB43_29 +; NO_SVE-NEXT: .LBB43_14: // %else32 +; NO_SVE-NEXT: tbnz w10, #12, .LBB43_30 +; NO_SVE-NEXT: .LBB43_15: // %else35 +; NO_SVE-NEXT: tbnz w10, #13, .LBB43_31 +; NO_SVE-NEXT: .LBB43_16: // %else38 +; NO_SVE-NEXT: tbnz w10, #14, .LBB43_32 +; NO_SVE-NEXT: .LBB43_17: // %else41 +; NO_SVE-NEXT: tbnz w10, #15, .LBB43_33 +; NO_SVE-NEXT: .LBB43_18: // %else44 +; NO_SVE-NEXT: tbz w10, #16, .LBB43_34 +; NO_SVE-NEXT: .LBB43_19: // %cond.load46 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x9] +; NO_SVE-NEXT: tbnz w10, #17, .LBB43_35 +; NO_SVE-NEXT: b .LBB43_36 +; NO_SVE-NEXT: .LBB43_20: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w10, #3, .LBB43_6 +; NO_SVE-NEXT: .LBB43_21: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w10, #4, .LBB43_7 +; NO_SVE-NEXT: .LBB43_22: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w10, #5, .LBB43_8 +; NO_SVE-NEXT: .LBB43_23: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w10, #6, .LBB43_9 +; NO_SVE-NEXT: .LBB43_24: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w10, #7, .LBB43_10 +; NO_SVE-NEXT: .LBB43_25: // %cond.load19 +; NO_SVE-NEXT: add x9, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w10, #8, .LBB43_11 +; NO_SVE-NEXT: .LBB43_26: // %cond.load22 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w10, #9, .LBB43_12 +; NO_SVE-NEXT: .LBB43_27: // %cond.load25 +; NO_SVE-NEXT: add x9, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w10, #10, .LBB43_13 +; NO_SVE-NEXT: .LBB43_28: // %cond.load28 +; NO_SVE-NEXT: add x9, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w10, #11, .LBB43_14 +; NO_SVE-NEXT: .LBB43_29: // %cond.load31 +; NO_SVE-NEXT: add x9, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w10, #12, .LBB43_15 +; NO_SVE-NEXT: .LBB43_30: // %cond.load34 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w10, #13, .LBB43_16 +; NO_SVE-NEXT: .LBB43_31: // %cond.load37 +; NO_SVE-NEXT: add x9, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w10, #14, .LBB43_17 +; NO_SVE-NEXT: .LBB43_32: // %cond.load40 +; NO_SVE-NEXT: add x9, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w10, #15, .LBB43_18 +; NO_SVE-NEXT: .LBB43_33: // %cond.load43 +; NO_SVE-NEXT: add x9, x0, #15 +; 
NO_SVE-NEXT: ld1 { v0.b }[15], [x9] +; NO_SVE-NEXT: tbnz w10, #16, .LBB43_19 +; NO_SVE-NEXT: .LBB43_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w10, #17, .LBB43_36 +; NO_SVE-NEXT: .LBB43_35: // %cond.load49 +; NO_SVE-NEXT: add x9, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x9] +; NO_SVE-NEXT: .LBB43_36: // %else50 +; NO_SVE-NEXT: tbnz w10, #18, .LBB43_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w10, #19, .LBB43_53 +; NO_SVE-NEXT: .LBB43_38: // %else56 +; NO_SVE-NEXT: tbnz w10, #20, .LBB43_54 +; NO_SVE-NEXT: .LBB43_39: // %else59 +; NO_SVE-NEXT: tbnz w10, #21, .LBB43_55 +; NO_SVE-NEXT: .LBB43_40: // %else62 +; NO_SVE-NEXT: tbnz w10, #22, .LBB43_56 +; NO_SVE-NEXT: .LBB43_41: // %else65 +; NO_SVE-NEXT: tbnz w10, #23, .LBB43_57 +; NO_SVE-NEXT: .LBB43_42: // %else68 +; NO_SVE-NEXT: tbnz w10, #24, .LBB43_58 +; NO_SVE-NEXT: .LBB43_43: // %else71 +; NO_SVE-NEXT: tbnz w10, #25, .LBB43_59 +; NO_SVE-NEXT: .LBB43_44: // %else74 +; NO_SVE-NEXT: tbnz w10, #26, .LBB43_60 +; NO_SVE-NEXT: .LBB43_45: // %else77 +; NO_SVE-NEXT: tbnz w10, #27, .LBB43_61 +; NO_SVE-NEXT: .LBB43_46: // %else80 +; NO_SVE-NEXT: tbnz w10, #28, .LBB43_62 +; NO_SVE-NEXT: .LBB43_47: // %else83 +; NO_SVE-NEXT: tbnz w10, #29, .LBB43_63 +; NO_SVE-NEXT: .LBB43_48: // %else86 +; NO_SVE-NEXT: tbnz w10, #30, .LBB43_64 +; NO_SVE-NEXT: .LBB43_49: // %else89 +; NO_SVE-NEXT: tbnz w10, #31, .LBB43_65 +; NO_SVE-NEXT: .LBB43_50: // %else92 +; NO_SVE-NEXT: tbz x10, #32, .LBB43_66 +; NO_SVE-NEXT: .LBB43_51: // %cond.load94 +; NO_SVE-NEXT: add x9, x0, #32 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x9] +; NO_SVE-NEXT: tbnz x10, #33, .LBB43_67 +; NO_SVE-NEXT: b .LBB43_68 +; NO_SVE-NEXT: .LBB43_52: // %cond.load52 +; NO_SVE-NEXT: add x9, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x9] +; NO_SVE-NEXT: tbz w10, #19, .LBB43_38 +; NO_SVE-NEXT: .LBB43_53: // %cond.load55 +; NO_SVE-NEXT: add x9, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x9] +; NO_SVE-NEXT: tbz w10, #20, .LBB43_39 +; NO_SVE-NEXT: .LBB43_54: // %cond.load58 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x9] +; NO_SVE-NEXT: tbz w10, #21, .LBB43_40 +; NO_SVE-NEXT: .LBB43_55: // %cond.load61 +; NO_SVE-NEXT: add x9, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x9] +; NO_SVE-NEXT: tbz w10, #22, .LBB43_41 +; NO_SVE-NEXT: .LBB43_56: // %cond.load64 +; NO_SVE-NEXT: add x9, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x9] +; NO_SVE-NEXT: tbz w10, #23, .LBB43_42 +; NO_SVE-NEXT: .LBB43_57: // %cond.load67 +; NO_SVE-NEXT: add x9, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x9] +; NO_SVE-NEXT: tbz w10, #24, .LBB43_43 +; NO_SVE-NEXT: .LBB43_58: // %cond.load70 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x9] +; NO_SVE-NEXT: tbz w10, #25, .LBB43_44 +; NO_SVE-NEXT: .LBB43_59: // %cond.load73 +; NO_SVE-NEXT: add x9, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x9] +; NO_SVE-NEXT: tbz w10, #26, .LBB43_45 +; NO_SVE-NEXT: .LBB43_60: // %cond.load76 +; NO_SVE-NEXT: add x9, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x9] +; NO_SVE-NEXT: tbz w10, #27, .LBB43_46 +; NO_SVE-NEXT: .LBB43_61: // %cond.load79 +; NO_SVE-NEXT: add x9, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x9] +; NO_SVE-NEXT: tbz w10, #28, .LBB43_47 +; NO_SVE-NEXT: .LBB43_62: // %cond.load82 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x9] +; NO_SVE-NEXT: tbz w10, #29, .LBB43_48 +; NO_SVE-NEXT: .LBB43_63: // %cond.load85 +; NO_SVE-NEXT: add x9, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x9] +; NO_SVE-NEXT: tbz w10, #30, .LBB43_49 +; NO_SVE-NEXT: .LBB43_64: // %cond.load88 +; 
NO_SVE-NEXT: add x9, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x9] +; NO_SVE-NEXT: tbz w10, #31, .LBB43_50 +; NO_SVE-NEXT: .LBB43_65: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x9] +; NO_SVE-NEXT: tbnz x10, #32, .LBB43_51 +; NO_SVE-NEXT: .LBB43_66: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz x10, #33, .LBB43_68 +; NO_SVE-NEXT: .LBB43_67: // %cond.load97 +; NO_SVE-NEXT: add x9, x0, #33 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x9] +; NO_SVE-NEXT: .LBB43_68: // %else98 +; NO_SVE-NEXT: tbnz x10, #34, .LBB43_91 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x10, #35, .LBB43_92 +; NO_SVE-NEXT: .LBB43_70: // %else104 +; NO_SVE-NEXT: tbnz x10, #36, .LBB43_93 +; NO_SVE-NEXT: .LBB43_71: // %else107 +; NO_SVE-NEXT: tbnz x10, #37, .LBB43_94 +; NO_SVE-NEXT: .LBB43_72: // %else110 +; NO_SVE-NEXT: tbnz x10, #38, .LBB43_95 +; NO_SVE-NEXT: .LBB43_73: // %else113 +; NO_SVE-NEXT: tbnz x10, #39, .LBB43_96 +; NO_SVE-NEXT: .LBB43_74: // %else116 +; NO_SVE-NEXT: tbnz x10, #40, .LBB43_97 +; NO_SVE-NEXT: .LBB43_75: // %else119 +; NO_SVE-NEXT: tbz x10, #41, .LBB43_77 +; NO_SVE-NEXT: .LBB43_76: // %cond.load121 +; NO_SVE-NEXT: add x9, x0, #41 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x9] +; NO_SVE-NEXT: .LBB43_77: // %else122 +; NO_SVE-NEXT: ldp q3, q4, [x1, #64] +; NO_SVE-NEXT: ldp q5, q6, [x1, #96] +; NO_SVE-NEXT: tbz x10, #42, .LBB43_79 +; NO_SVE-NEXT: // %bb.78: // %cond.load124 +; NO_SVE-NEXT: add x9, x0, #42 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x9] +; NO_SVE-NEXT: .LBB43_79: // %else125 +; NO_SVE-NEXT: cmeq v7.16b, v6.16b, #0 +; NO_SVE-NEXT: cmeq v6.16b, v5.16b, #0 +; NO_SVE-NEXT: cmeq v5.16b, v4.16b, #0 +; NO_SVE-NEXT: cmeq v4.16b, v3.16b, #0 +; NO_SVE-NEXT: tbz x10, #43, .LBB43_81 +; NO_SVE-NEXT: // %bb.80: // %cond.load127 +; NO_SVE-NEXT: add x9, x0, #43 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x9] +; NO_SVE-NEXT: .LBB43_81: // %else128 +; NO_SVE-NEXT: umov w11, v7.b[1] +; NO_SVE-NEXT: umov w16, v7.b[0] +; NO_SVE-NEXT: umov w12, v6.b[1] +; NO_SVE-NEXT: umov w15, v6.b[0] +; NO_SVE-NEXT: umov w13, v5.b[1] +; NO_SVE-NEXT: umov w14, v5.b[0] +; NO_SVE-NEXT: umov w9, v4.b[1] +; NO_SVE-NEXT: umov w18, v4.b[0] +; NO_SVE-NEXT: tbz x10, #44, .LBB43_83 +; NO_SVE-NEXT: // %bb.82: // %cond.load130 +; NO_SVE-NEXT: add x17, x0, #44 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x17] +; NO_SVE-NEXT: .LBB43_83: // %else131 +; NO_SVE-NEXT: umov w4, v7.b[2] +; NO_SVE-NEXT: and w1, w12, #0x1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[2] +; NO_SVE-NEXT: and w15, w13, #0x1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w3, v5.b[2] +; NO_SVE-NEXT: umov w14, v4.b[2] +; NO_SVE-NEXT: and w17, w11, #0x1 +; NO_SVE-NEXT: and w11, w16, #0x1 +; NO_SVE-NEXT: and w16, w9, #0x1 +; NO_SVE-NEXT: and w9, w18, #0x1 +; NO_SVE-NEXT: tbz x10, #45, .LBB43_85 +; NO_SVE-NEXT: // %bb.84: // %cond.load133 +; NO_SVE-NEXT: add x18, x0, #45 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x18] +; NO_SVE-NEXT: .LBB43_85: // %else134 +; NO_SVE-NEXT: bfi w11, w17, #1, #1 +; NO_SVE-NEXT: and w17, w4, #0x1 +; NO_SVE-NEXT: umov w18, v7.b[3] +; NO_SVE-NEXT: bfi w12, w1, #1, #1 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[3] +; NO_SVE-NEXT: bfi w13, w15, #1, #1 +; NO_SVE-NEXT: umov w4, v5.b[3] +; NO_SVE-NEXT: umov w15, v4.b[3] +; NO_SVE-NEXT: and w3, w3, #0x1 +; NO_SVE-NEXT: bfi w9, w16, #1, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: tbz x10, #46, .LBB43_87 +; NO_SVE-NEXT: // %bb.86: // %cond.load136 +; NO_SVE-NEXT: add x16, x0, #46 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x16] +; 
NO_SVE-NEXT: .LBB43_87: // %else137 +; NO_SVE-NEXT: bfi w11, w17, #2, #1 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: umov w18, v7.b[4] +; NO_SVE-NEXT: bfi w12, w1, #2, #1 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[4] +; NO_SVE-NEXT: umov w5, v5.b[4] +; NO_SVE-NEXT: umov w17, v4.b[4] +; NO_SVE-NEXT: bfi w13, w3, #2, #1 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: bfi w9, w14, #2, #1 +; NO_SVE-NEXT: and w4, w15, #0x1 +; NO_SVE-NEXT: tbz x10, #47, .LBB43_89 +; NO_SVE-NEXT: // %bb.88: // %cond.load139 +; NO_SVE-NEXT: add x14, x0, #47 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x14] +; NO_SVE-NEXT: .LBB43_89: // %else140 +; NO_SVE-NEXT: bfi w11, w16, #3, #1 +; NO_SVE-NEXT: umov w16, v7.b[5] +; NO_SVE-NEXT: bfi w12, w1, #3, #1 +; NO_SVE-NEXT: umov w1, v6.b[5] +; NO_SVE-NEXT: bfi w13, w3, #3, #1 +; NO_SVE-NEXT: umov w3, v5.b[5] +; NO_SVE-NEXT: umov w15, v4.b[5] +; NO_SVE-NEXT: and w14, w18, #0x1 +; NO_SVE-NEXT: and w18, w2, #0x1 +; NO_SVE-NEXT: and w2, w5, #0x1 +; NO_SVE-NEXT: bfi w9, w4, #3, #1 +; NO_SVE-NEXT: and w4, w17, #0x1 +; NO_SVE-NEXT: tbz x10, #48, .LBB43_98 +; NO_SVE-NEXT: // %bb.90: // %cond.load142 +; NO_SVE-NEXT: add x17, x0, #48 +; NO_SVE-NEXT: ld1 { v3.b }[0], [x17] +; NO_SVE-NEXT: b .LBB43_99 +; NO_SVE-NEXT: .LBB43_91: // %cond.load100 +; NO_SVE-NEXT: add x9, x0, #34 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz x10, #35, .LBB43_70 +; NO_SVE-NEXT: .LBB43_92: // %cond.load103 +; NO_SVE-NEXT: add x9, x0, #35 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x9] +; NO_SVE-NEXT: tbz x10, #36, .LBB43_71 +; NO_SVE-NEXT: .LBB43_93: // %cond.load106 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz x10, #37, .LBB43_72 +; NO_SVE-NEXT: .LBB43_94: // %cond.load109 +; NO_SVE-NEXT: add x9, x0, #37 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz x10, #38, .LBB43_73 +; NO_SVE-NEXT: .LBB43_95: // %cond.load112 +; NO_SVE-NEXT: add x9, x0, #38 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz x10, #39, .LBB43_74 +; NO_SVE-NEXT: .LBB43_96: // %cond.load115 +; NO_SVE-NEXT: add x9, x0, #39 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz x10, #40, .LBB43_75 +; NO_SVE-NEXT: .LBB43_97: // %cond.load118 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x9] +; NO_SVE-NEXT: tbnz x10, #41, .LBB43_76 +; NO_SVE-NEXT: b .LBB43_77 +; NO_SVE-NEXT: .LBB43_98: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: .LBB43_99: // %else143 +; NO_SVE-NEXT: bfi w11, w14, #4, #1 +; NO_SVE-NEXT: umov w17, v7.b[6] +; NO_SVE-NEXT: bfi w12, w18, #4, #1 +; NO_SVE-NEXT: and w18, w1, #0x1 +; NO_SVE-NEXT: umov w1, v6.b[6] +; NO_SVE-NEXT: bfi w13, w2, #4, #1 +; NO_SVE-NEXT: and w2, w3, #0x1 +; NO_SVE-NEXT: umov w3, v5.b[6] +; NO_SVE-NEXT: umov w14, v4.b[6] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: bfi w9, w4, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: tbz x10, #49, .LBB43_101 +; NO_SVE-NEXT: // %bb.100: // %cond.load145 +; NO_SVE-NEXT: add x4, x0, #49 +; NO_SVE-NEXT: ld1 { v3.b }[1], [x4] +; NO_SVE-NEXT: .LBB43_101: // %else146 +; NO_SVE-NEXT: bfi w11, w16, #5, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v7.b[7] +; NO_SVE-NEXT: bfi w12, w18, #5, #1 +; NO_SVE-NEXT: and w18, w1, #0x1 +; NO_SVE-NEXT: umov w1, v6.b[7] +; NO_SVE-NEXT: bfi w13, w2, #5, #1 +; NO_SVE-NEXT: umov w4, v5.b[7] +; NO_SVE-NEXT: umov w2, v4.b[7] +; NO_SVE-NEXT: and w3, w3, #0x1 +; NO_SVE-NEXT: bfi w9, w15, #5, #1 +; NO_SVE-NEXT: and w5, w14, #0x1 +; NO_SVE-NEXT: tbz x10, #50, .LBB43_103 +; NO_SVE-NEXT: // %bb.102: // 
%cond.load148 +; NO_SVE-NEXT: add x14, x0, #50 +; NO_SVE-NEXT: ld1 { v3.b }[2], [x14] +; NO_SVE-NEXT: .LBB43_103: // %else149 +; NO_SVE-NEXT: orr w14, w11, w16, lsl #6 +; NO_SVE-NEXT: umov w7, v7.b[8] +; NO_SVE-NEXT: umov w20, v6.b[8] +; NO_SVE-NEXT: umov w22, v5.b[8] +; NO_SVE-NEXT: umov w11, v4.b[8] +; NO_SVE-NEXT: and w6, w17, #0x1 +; NO_SVE-NEXT: orr w15, w12, w18, lsl #6 +; NO_SVE-NEXT: and w19, w1, #0x1 +; NO_SVE-NEXT: orr w16, w13, w3, lsl #6 +; NO_SVE-NEXT: and w21, w4, #0x1 +; NO_SVE-NEXT: orr w17, w9, w5, lsl #6 +; NO_SVE-NEXT: and w18, w2, #0x1 +; NO_SVE-NEXT: tbz x10, #51, .LBB43_105 +; NO_SVE-NEXT: // %bb.104: // %cond.load151 +; NO_SVE-NEXT: add x9, x0, #51 +; NO_SVE-NEXT: ld1 { v3.b }[3], [x9] +; NO_SVE-NEXT: .LBB43_105: // %else152 +; NO_SVE-NEXT: orr w12, w14, w6, lsl #7 +; NO_SVE-NEXT: umov w2, v7.b[9] +; NO_SVE-NEXT: umov w4, v6.b[9] +; NO_SVE-NEXT: umov w6, v5.b[9] +; NO_SVE-NEXT: umov w9, v4.b[9] +; NO_SVE-NEXT: and w1, w7, #0x1 +; NO_SVE-NEXT: orr w13, w15, w19, lsl #7 +; NO_SVE-NEXT: and w3, w20, #0x1 +; NO_SVE-NEXT: orr w14, w16, w21, lsl #7 +; NO_SVE-NEXT: and w5, w22, #0x1 +; NO_SVE-NEXT: orr w15, w17, w18, lsl #7 +; NO_SVE-NEXT: and w16, w11, #0x1 +; NO_SVE-NEXT: tbz x10, #52, .LBB43_107 +; NO_SVE-NEXT: // %bb.106: // %cond.load154 +; NO_SVE-NEXT: add x11, x0, #52 +; NO_SVE-NEXT: ld1 { v3.b }[4], [x11] +; NO_SVE-NEXT: .LBB43_107: // %else155 +; NO_SVE-NEXT: orr w12, w12, w1, lsl #8 +; NO_SVE-NEXT: and w17, w2, #0x1 +; NO_SVE-NEXT: umov w18, v7.b[10] +; NO_SVE-NEXT: and w1, w4, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[10] +; NO_SVE-NEXT: umov w4, v5.b[10] +; NO_SVE-NEXT: umov w11, v4.b[10] +; NO_SVE-NEXT: orr w13, w13, w3, lsl #8 +; NO_SVE-NEXT: orr w14, w14, w5, lsl #8 +; NO_SVE-NEXT: and w3, w6, #0x1 +; NO_SVE-NEXT: orr w15, w15, w16, lsl #8 +; NO_SVE-NEXT: and w16, w9, #0x1 +; NO_SVE-NEXT: tbz x10, #53, .LBB43_109 +; NO_SVE-NEXT: // %bb.108: // %cond.load157 +; NO_SVE-NEXT: add x9, x0, #53 +; NO_SVE-NEXT: ld1 { v3.b }[5], [x9] +; NO_SVE-NEXT: .LBB43_109: // %else158 +; NO_SVE-NEXT: orr w12, w12, w17, lsl #9 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v7.b[11] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #9 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[11] +; NO_SVE-NEXT: orr w14, w14, w3, lsl #9 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v5.b[11] +; NO_SVE-NEXT: umov w9, v4.b[11] +; NO_SVE-NEXT: orr w15, w15, w16, lsl #9 +; NO_SVE-NEXT: and w16, w11, #0x1 +; NO_SVE-NEXT: tbz x10, #54, .LBB43_111 +; NO_SVE-NEXT: // %bb.110: // %cond.load160 +; NO_SVE-NEXT: add x11, x0, #54 +; NO_SVE-NEXT: ld1 { v3.b }[6], [x11] +; NO_SVE-NEXT: .LBB43_111: // %else161 +; NO_SVE-NEXT: orr w12, w12, w17, lsl #10 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v7.b[12] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #10 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[12] +; NO_SVE-NEXT: orr w14, w14, w3, lsl #10 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v5.b[12] +; NO_SVE-NEXT: umov w11, v4.b[12] +; NO_SVE-NEXT: orr w15, w15, w16, lsl #10 +; NO_SVE-NEXT: and w16, w9, #0x1 +; NO_SVE-NEXT: tbz x10, #55, .LBB43_113 +; NO_SVE-NEXT: // %bb.112: // %cond.load163 +; NO_SVE-NEXT: add x9, x0, #55 +; NO_SVE-NEXT: ld1 { v3.b }[7], [x9] +; NO_SVE-NEXT: .LBB43_113: // %else164 +; NO_SVE-NEXT: orr w12, w12, w17, lsl #11 +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: umov w18, v7.b[13] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #11 +; NO_SVE-NEXT: and w1, w2, #0x1 +; NO_SVE-NEXT: umov w2, v6.b[13] +; NO_SVE-NEXT: orr w14, w14, 
w3, lsl #11 +; NO_SVE-NEXT: and w3, w4, #0x1 +; NO_SVE-NEXT: umov w4, v5.b[13] +; NO_SVE-NEXT: umov w9, v4.b[13] +; NO_SVE-NEXT: orr w15, w15, w16, lsl #11 +; NO_SVE-NEXT: and w16, w11, #0x1 +; NO_SVE-NEXT: tbz x10, #56, .LBB43_115 +; NO_SVE-NEXT: // %bb.114: // %cond.load166 +; NO_SVE-NEXT: add x11, x0, #56 +; NO_SVE-NEXT: ld1 { v3.b }[8], [x11] +; NO_SVE-NEXT: .LBB43_115: // %else167 +; NO_SVE-NEXT: orr w11, w12, w17, lsl #12 +; NO_SVE-NEXT: and w12, w18, #0x1 +; NO_SVE-NEXT: umov w17, v7.b[14] +; NO_SVE-NEXT: orr w13, w13, w1, lsl #12 +; NO_SVE-NEXT: and w18, w2, #0x1 +; NO_SVE-NEXT: umov w1, v6.b[14] +; NO_SVE-NEXT: orr w14, w14, w3, lsl #12 +; NO_SVE-NEXT: and w2, w4, #0x1 +; NO_SVE-NEXT: umov w3, v5.b[14] +; NO_SVE-NEXT: umov w4, v4.b[14] +; NO_SVE-NEXT: orr w5, w15, w16, lsl #12 +; NO_SVE-NEXT: and w6, w9, #0x1 +; NO_SVE-NEXT: tbz x10, #57, .LBB43_117 +; NO_SVE-NEXT: // %bb.116: // %cond.load169 +; NO_SVE-NEXT: add x9, x0, #57 +; NO_SVE-NEXT: ld1 { v3.b }[9], [x9] +; NO_SVE-NEXT: .LBB43_117: // %else170 +; NO_SVE-NEXT: orr w15, w11, w12, lsl #13 +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: orr w16, w13, w18, lsl #13 +; NO_SVE-NEXT: and w18, w1, #0x1 +; NO_SVE-NEXT: orr w9, w14, w2, lsl #13 +; NO_SVE-NEXT: and w11, w3, #0x1 +; NO_SVE-NEXT: orr w12, w5, w6, lsl #13 +; NO_SVE-NEXT: and w13, w4, #0x1 +; NO_SVE-NEXT: tbz x10, #58, .LBB43_119 +; NO_SVE-NEXT: // %bb.118: // %cond.load172 +; NO_SVE-NEXT: add x14, x0, #58 +; NO_SVE-NEXT: ld1 { v3.b }[10], [x14] +; NO_SVE-NEXT: .LBB43_119: // %else173 +; NO_SVE-NEXT: orr w14, w15, w17, lsl #14 +; NO_SVE-NEXT: umov w15, v7.b[15] +; NO_SVE-NEXT: orr w16, w16, w18, lsl #14 +; NO_SVE-NEXT: umov w17, v6.b[15] +; NO_SVE-NEXT: umov w18, v5.b[15] +; NO_SVE-NEXT: umov w1, v4.b[15] +; NO_SVE-NEXT: orr w2, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: tbz x10, #59, .LBB43_121 +; NO_SVE-NEXT: // %bb.120: // %cond.load175 +; NO_SVE-NEXT: add x11, x0, #59 +; NO_SVE-NEXT: ld1 { v3.b }[11], [x11] +; NO_SVE-NEXT: .LBB43_121: // %else176 +; NO_SVE-NEXT: orr w12, w14, w15, lsl #15 +; NO_SVE-NEXT: orr w11, w16, w17, lsl #15 +; NO_SVE-NEXT: orr w13, w2, w18, lsl #15 +; NO_SVE-NEXT: orr w9, w9, w1, lsl #15 +; NO_SVE-NEXT: tbz x10, #60, .LBB43_123 +; NO_SVE-NEXT: // %bb.122: // %cond.load178 +; NO_SVE-NEXT: add x14, x0, #60 +; NO_SVE-NEXT: ld1 { v3.b }[12], [x14] +; NO_SVE-NEXT: .LBB43_123: // %else179 +; NO_SVE-NEXT: bfi w11, w12, #16, #16 +; NO_SVE-NEXT: bfi w9, w13, #16, #16 +; NO_SVE-NEXT: tbnz x10, #61, .LBB43_128 +; NO_SVE-NEXT: // %bb.124: // %else182 +; NO_SVE-NEXT: tbnz x10, #62, .LBB43_129 +; NO_SVE-NEXT: .LBB43_125: // %else185 +; NO_SVE-NEXT: bfi x9, x11, #32, #32 +; NO_SVE-NEXT: tbnz x10, #63, .LBB43_130 +; NO_SVE-NEXT: .LBB43_126: // %else188 +; NO_SVE-NEXT: tbz w9, #0, .LBB43_131 +; NO_SVE-NEXT: .LBB43_127: // %cond.load190 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #1, .LBB43_132 +; NO_SVE-NEXT: b .LBB43_133 +; NO_SVE-NEXT: .LBB43_128: // %cond.load181 +; NO_SVE-NEXT: add x12, x0, #61 +; NO_SVE-NEXT: ld1 { v3.b }[13], [x12] +; NO_SVE-NEXT: tbz x10, #62, .LBB43_125 +; NO_SVE-NEXT: .LBB43_129: // %cond.load184 +; NO_SVE-NEXT: add x12, x0, #62 +; NO_SVE-NEXT: ld1 { v3.b }[14], [x12] +; NO_SVE-NEXT: bfi x9, x11, #32, #32 +; NO_SVE-NEXT: tbz x10, #63, .LBB43_126 +; NO_SVE-NEXT: .LBB43_130: // %cond.load187 +; NO_SVE-NEXT: add x10, x0, #63 +; NO_SVE-NEXT: ld1 { v3.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #0, .LBB43_127 +; NO_SVE-NEXT: .LBB43_131: +; NO_SVE-NEXT: // 
implicit-def: $q4 +; NO_SVE-NEXT: tbz w9, #1, .LBB43_133 +; NO_SVE-NEXT: .LBB43_132: // %cond.load193 +; NO_SVE-NEXT: add x10, x0, #65 +; NO_SVE-NEXT: ld1 { v4.b }[1], [x10] +; NO_SVE-NEXT: .LBB43_133: // %else194 +; NO_SVE-NEXT: tbnz w9, #2, .LBB43_149 +; NO_SVE-NEXT: // %bb.134: // %else197 +; NO_SVE-NEXT: tbnz w9, #3, .LBB43_150 +; NO_SVE-NEXT: .LBB43_135: // %else200 +; NO_SVE-NEXT: tbnz w9, #4, .LBB43_151 +; NO_SVE-NEXT: .LBB43_136: // %else203 +; NO_SVE-NEXT: tbnz w9, #5, .LBB43_152 +; NO_SVE-NEXT: .LBB43_137: // %else206 +; NO_SVE-NEXT: tbnz w9, #6, .LBB43_153 +; NO_SVE-NEXT: .LBB43_138: // %else209 +; NO_SVE-NEXT: tbnz w9, #7, .LBB43_154 +; NO_SVE-NEXT: .LBB43_139: // %else212 +; NO_SVE-NEXT: tbnz w9, #8, .LBB43_155 +; NO_SVE-NEXT: .LBB43_140: // %else215 +; NO_SVE-NEXT: tbnz w9, #9, .LBB43_156 +; NO_SVE-NEXT: .LBB43_141: // %else218 +; NO_SVE-NEXT: tbnz w9, #10, .LBB43_157 +; NO_SVE-NEXT: .LBB43_142: // %else221 +; NO_SVE-NEXT: tbnz w9, #11, .LBB43_158 +; NO_SVE-NEXT: .LBB43_143: // %else224 +; NO_SVE-NEXT: tbnz w9, #12, .LBB43_159 +; NO_SVE-NEXT: .LBB43_144: // %else227 +; NO_SVE-NEXT: tbnz w9, #13, .LBB43_160 +; NO_SVE-NEXT: .LBB43_145: // %else230 +; NO_SVE-NEXT: tbnz w9, #14, .LBB43_161 +; NO_SVE-NEXT: .LBB43_146: // %else233 +; NO_SVE-NEXT: tbnz w9, #15, .LBB43_162 +; NO_SVE-NEXT: .LBB43_147: // %else236 +; NO_SVE-NEXT: tbz w9, #16, .LBB43_163 +; NO_SVE-NEXT: .LBB43_148: // %cond.load238 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB43_164 +; NO_SVE-NEXT: b .LBB43_165 +; NO_SVE-NEXT: .LBB43_149: // %cond.load196 +; NO_SVE-NEXT: add x10, x0, #66 +; NO_SVE-NEXT: ld1 { v4.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB43_135 +; NO_SVE-NEXT: .LBB43_150: // %cond.load199 +; NO_SVE-NEXT: add x10, x0, #67 +; NO_SVE-NEXT: ld1 { v4.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB43_136 +; NO_SVE-NEXT: .LBB43_151: // %cond.load202 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB43_137 +; NO_SVE-NEXT: .LBB43_152: // %cond.load205 +; NO_SVE-NEXT: add x10, x0, #69 +; NO_SVE-NEXT: ld1 { v4.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB43_138 +; NO_SVE-NEXT: .LBB43_153: // %cond.load208 +; NO_SVE-NEXT: add x10, x0, #70 +; NO_SVE-NEXT: ld1 { v4.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB43_139 +; NO_SVE-NEXT: .LBB43_154: // %cond.load211 +; NO_SVE-NEXT: add x10, x0, #71 +; NO_SVE-NEXT: ld1 { v4.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #8, .LBB43_140 +; NO_SVE-NEXT: .LBB43_155: // %cond.load214 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #9, .LBB43_141 +; NO_SVE-NEXT: .LBB43_156: // %cond.load217 +; NO_SVE-NEXT: add x10, x0, #73 +; NO_SVE-NEXT: ld1 { v4.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #10, .LBB43_142 +; NO_SVE-NEXT: .LBB43_157: // %cond.load220 +; NO_SVE-NEXT: add x10, x0, #74 +; NO_SVE-NEXT: ld1 { v4.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB43_143 +; NO_SVE-NEXT: .LBB43_158: // %cond.load223 +; NO_SVE-NEXT: add x10, x0, #75 +; NO_SVE-NEXT: ld1 { v4.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB43_144 +; NO_SVE-NEXT: .LBB43_159: // %cond.load226 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB43_145 +; NO_SVE-NEXT: .LBB43_160: // %cond.load229 +; NO_SVE-NEXT: add x10, x0, #77 +; NO_SVE-NEXT: ld1 { v4.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB43_146 +; NO_SVE-NEXT: .LBB43_161: // %cond.load232 +; NO_SVE-NEXT: add x10, x0, #78 +; NO_SVE-NEXT: ld1 { v4.b 
}[14], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB43_147 +; NO_SVE-NEXT: .LBB43_162: // %cond.load235 +; NO_SVE-NEXT: add x10, x0, #79 +; NO_SVE-NEXT: ld1 { v4.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB43_148 +; NO_SVE-NEXT: .LBB43_163: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w9, #17, .LBB43_165 +; NO_SVE-NEXT: .LBB43_164: // %cond.load241 +; NO_SVE-NEXT: add x10, x0, #81 +; NO_SVE-NEXT: ld1 { v5.b }[1], [x10] +; NO_SVE-NEXT: .LBB43_165: // %else242 +; NO_SVE-NEXT: tbnz w9, #18, .LBB43_181 +; NO_SVE-NEXT: // %bb.166: // %else245 +; NO_SVE-NEXT: tbnz w9, #19, .LBB43_182 +; NO_SVE-NEXT: .LBB43_167: // %else248 +; NO_SVE-NEXT: tbnz w9, #20, .LBB43_183 +; NO_SVE-NEXT: .LBB43_168: // %else251 +; NO_SVE-NEXT: tbnz w9, #21, .LBB43_184 +; NO_SVE-NEXT: .LBB43_169: // %else254 +; NO_SVE-NEXT: tbnz w9, #22, .LBB43_185 +; NO_SVE-NEXT: .LBB43_170: // %else257 +; NO_SVE-NEXT: tbnz w9, #23, .LBB43_186 +; NO_SVE-NEXT: .LBB43_171: // %else260 +; NO_SVE-NEXT: tbnz w9, #24, .LBB43_187 +; NO_SVE-NEXT: .LBB43_172: // %else263 +; NO_SVE-NEXT: tbnz w9, #25, .LBB43_188 +; NO_SVE-NEXT: .LBB43_173: // %else266 +; NO_SVE-NEXT: tbnz w9, #26, .LBB43_189 +; NO_SVE-NEXT: .LBB43_174: // %else269 +; NO_SVE-NEXT: tbnz w9, #27, .LBB43_190 +; NO_SVE-NEXT: .LBB43_175: // %else272 +; NO_SVE-NEXT: tbnz w9, #28, .LBB43_191 +; NO_SVE-NEXT: .LBB43_176: // %else275 +; NO_SVE-NEXT: tbnz w9, #29, .LBB43_192 +; NO_SVE-NEXT: .LBB43_177: // %else278 +; NO_SVE-NEXT: tbnz w9, #30, .LBB43_193 +; NO_SVE-NEXT: .LBB43_178: // %else281 +; NO_SVE-NEXT: tbnz w9, #31, .LBB43_194 +; NO_SVE-NEXT: .LBB43_179: // %else284 +; NO_SVE-NEXT: tbz x9, #32, .LBB43_195 +; NO_SVE-NEXT: .LBB43_180: // %cond.load286 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #33, .LBB43_196 +; NO_SVE-NEXT: b .LBB43_197 +; NO_SVE-NEXT: .LBB43_181: // %cond.load244 +; NO_SVE-NEXT: add x10, x0, #82 +; NO_SVE-NEXT: ld1 { v5.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB43_167 +; NO_SVE-NEXT: .LBB43_182: // %cond.load247 +; NO_SVE-NEXT: add x10, x0, #83 +; NO_SVE-NEXT: ld1 { v5.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB43_168 +; NO_SVE-NEXT: .LBB43_183: // %cond.load250 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB43_169 +; NO_SVE-NEXT: .LBB43_184: // %cond.load253 +; NO_SVE-NEXT: add x10, x0, #85 +; NO_SVE-NEXT: ld1 { v5.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB43_170 +; NO_SVE-NEXT: .LBB43_185: // %cond.load256 +; NO_SVE-NEXT: add x10, x0, #86 +; NO_SVE-NEXT: ld1 { v5.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB43_171 +; NO_SVE-NEXT: .LBB43_186: // %cond.load259 +; NO_SVE-NEXT: add x10, x0, #87 +; NO_SVE-NEXT: ld1 { v5.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB43_172 +; NO_SVE-NEXT: .LBB43_187: // %cond.load262 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB43_173 +; NO_SVE-NEXT: .LBB43_188: // %cond.load265 +; NO_SVE-NEXT: add x10, x0, #89 +; NO_SVE-NEXT: ld1 { v5.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB43_174 +; NO_SVE-NEXT: .LBB43_189: // %cond.load268 +; NO_SVE-NEXT: add x10, x0, #90 +; NO_SVE-NEXT: ld1 { v5.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB43_175 +; NO_SVE-NEXT: .LBB43_190: // %cond.load271 +; NO_SVE-NEXT: add x10, x0, #91 +; NO_SVE-NEXT: ld1 { v5.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB43_176 +; NO_SVE-NEXT: .LBB43_191: // %cond.load274 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #29, 
.LBB43_177 +; NO_SVE-NEXT: .LBB43_192: // %cond.load277 +; NO_SVE-NEXT: add x10, x0, #93 +; NO_SVE-NEXT: ld1 { v5.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB43_178 +; NO_SVE-NEXT: .LBB43_193: // %cond.load280 +; NO_SVE-NEXT: add x10, x0, #94 +; NO_SVE-NEXT: ld1 { v5.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB43_179 +; NO_SVE-NEXT: .LBB43_194: // %cond.load283 +; NO_SVE-NEXT: add x10, x0, #95 +; NO_SVE-NEXT: ld1 { v5.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #32, .LBB43_180 +; NO_SVE-NEXT: .LBB43_195: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz x9, #33, .LBB43_197 +; NO_SVE-NEXT: .LBB43_196: // %cond.load289 +; NO_SVE-NEXT: add x10, x0, #97 +; NO_SVE-NEXT: ld1 { v6.b }[1], [x10] +; NO_SVE-NEXT: .LBB43_197: // %else290 +; NO_SVE-NEXT: tbnz x9, #34, .LBB43_213 +; NO_SVE-NEXT: // %bb.198: // %else293 +; NO_SVE-NEXT: tbnz x9, #35, .LBB43_214 +; NO_SVE-NEXT: .LBB43_199: // %else296 +; NO_SVE-NEXT: tbnz x9, #36, .LBB43_215 +; NO_SVE-NEXT: .LBB43_200: // %else299 +; NO_SVE-NEXT: tbnz x9, #37, .LBB43_216 +; NO_SVE-NEXT: .LBB43_201: // %else302 +; NO_SVE-NEXT: tbnz x9, #38, .LBB43_217 +; NO_SVE-NEXT: .LBB43_202: // %else305 +; NO_SVE-NEXT: tbnz x9, #39, .LBB43_218 +; NO_SVE-NEXT: .LBB43_203: // %else308 +; NO_SVE-NEXT: tbnz x9, #40, .LBB43_219 +; NO_SVE-NEXT: .LBB43_204: // %else311 +; NO_SVE-NEXT: tbnz x9, #41, .LBB43_220 +; NO_SVE-NEXT: .LBB43_205: // %else314 +; NO_SVE-NEXT: tbnz x9, #42, .LBB43_221 +; NO_SVE-NEXT: .LBB43_206: // %else317 +; NO_SVE-NEXT: tbnz x9, #43, .LBB43_222 +; NO_SVE-NEXT: .LBB43_207: // %else320 +; NO_SVE-NEXT: tbnz x9, #44, .LBB43_223 +; NO_SVE-NEXT: .LBB43_208: // %else323 +; NO_SVE-NEXT: tbnz x9, #45, .LBB43_224 +; NO_SVE-NEXT: .LBB43_209: // %else326 +; NO_SVE-NEXT: tbnz x9, #46, .LBB43_225 +; NO_SVE-NEXT: .LBB43_210: // %else329 +; NO_SVE-NEXT: tbnz x9, #47, .LBB43_226 +; NO_SVE-NEXT: .LBB43_211: // %else332 +; NO_SVE-NEXT: tbz x9, #48, .LBB43_227 +; NO_SVE-NEXT: .LBB43_212: // %cond.load334 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #49, .LBB43_228 +; NO_SVE-NEXT: b .LBB43_229 +; NO_SVE-NEXT: .LBB43_213: // %cond.load292 +; NO_SVE-NEXT: add x10, x0, #98 +; NO_SVE-NEXT: ld1 { v6.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB43_199 +; NO_SVE-NEXT: .LBB43_214: // %cond.load295 +; NO_SVE-NEXT: add x10, x0, #99 +; NO_SVE-NEXT: ld1 { v6.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB43_200 +; NO_SVE-NEXT: .LBB43_215: // %cond.load298 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB43_201 +; NO_SVE-NEXT: .LBB43_216: // %cond.load301 +; NO_SVE-NEXT: add x10, x0, #101 +; NO_SVE-NEXT: ld1 { v6.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB43_202 +; NO_SVE-NEXT: .LBB43_217: // %cond.load304 +; NO_SVE-NEXT: add x10, x0, #102 +; NO_SVE-NEXT: ld1 { v6.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB43_203 +; NO_SVE-NEXT: .LBB43_218: // %cond.load307 +; NO_SVE-NEXT: add x10, x0, #103 +; NO_SVE-NEXT: ld1 { v6.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #40, .LBB43_204 +; NO_SVE-NEXT: .LBB43_219: // %cond.load310 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #41, .LBB43_205 +; NO_SVE-NEXT: .LBB43_220: // %cond.load313 +; NO_SVE-NEXT: add x10, x0, #105 +; NO_SVE-NEXT: ld1 { v6.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #42, .LBB43_206 +; NO_SVE-NEXT: .LBB43_221: // %cond.load316 +; NO_SVE-NEXT: add x10, x0, #106 +; NO_SVE-NEXT: ld1 { v6.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #43, .LBB43_207 +; NO_SVE-NEXT: 
.LBB43_222: // %cond.load319 +; NO_SVE-NEXT: add x10, x0, #107 +; NO_SVE-NEXT: ld1 { v6.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #44, .LBB43_208 +; NO_SVE-NEXT: .LBB43_223: // %cond.load322 +; NO_SVE-NEXT: add x10, x0, #108 +; NO_SVE-NEXT: ld1 { v6.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #45, .LBB43_209 +; NO_SVE-NEXT: .LBB43_224: // %cond.load325 +; NO_SVE-NEXT: add x10, x0, #109 +; NO_SVE-NEXT: ld1 { v6.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB43_210 +; NO_SVE-NEXT: .LBB43_225: // %cond.load328 +; NO_SVE-NEXT: add x10, x0, #110 +; NO_SVE-NEXT: ld1 { v6.b }[14], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB43_211 +; NO_SVE-NEXT: .LBB43_226: // %cond.load331 +; NO_SVE-NEXT: add x10, x0, #111 +; NO_SVE-NEXT: ld1 { v6.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #48, .LBB43_212 +; NO_SVE-NEXT: .LBB43_227: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz x9, #49, .LBB43_229 +; NO_SVE-NEXT: .LBB43_228: // %cond.load337 +; NO_SVE-NEXT: add x10, x0, #113 +; NO_SVE-NEXT: ld1 { v7.b }[1], [x10] +; NO_SVE-NEXT: .LBB43_229: // %else338 +; NO_SVE-NEXT: tbnz x9, #50, .LBB43_245 +; NO_SVE-NEXT: // %bb.230: // %else341 +; NO_SVE-NEXT: tbnz x9, #51, .LBB43_246 +; NO_SVE-NEXT: .LBB43_231: // %else344 +; NO_SVE-NEXT: tbnz x9, #52, .LBB43_247 +; NO_SVE-NEXT: .LBB43_232: // %else347 +; NO_SVE-NEXT: tbnz x9, #53, .LBB43_248 +; NO_SVE-NEXT: .LBB43_233: // %else350 +; NO_SVE-NEXT: tbnz x9, #54, .LBB43_249 +; NO_SVE-NEXT: .LBB43_234: // %else353 +; NO_SVE-NEXT: tbnz x9, #55, .LBB43_250 +; NO_SVE-NEXT: .LBB43_235: // %else356 +; NO_SVE-NEXT: tbnz x9, #56, .LBB43_251 +; NO_SVE-NEXT: .LBB43_236: // %else359 +; NO_SVE-NEXT: tbnz x9, #57, .LBB43_252 +; NO_SVE-NEXT: .LBB43_237: // %else362 +; NO_SVE-NEXT: tbnz x9, #58, .LBB43_253 +; NO_SVE-NEXT: .LBB43_238: // %else365 +; NO_SVE-NEXT: tbnz x9, #59, .LBB43_254 +; NO_SVE-NEXT: .LBB43_239: // %else368 +; NO_SVE-NEXT: tbnz x9, #60, .LBB43_255 +; NO_SVE-NEXT: .LBB43_240: // %else371 +; NO_SVE-NEXT: tbnz x9, #61, .LBB43_256 +; NO_SVE-NEXT: .LBB43_241: // %else374 +; NO_SVE-NEXT: tbnz x9, #62, .LBB43_257 +; NO_SVE-NEXT: .LBB43_242: // %else377 +; NO_SVE-NEXT: tbz x9, #63, .LBB43_244 +; NO_SVE-NEXT: .LBB43_243: // %cond.load379 +; NO_SVE-NEXT: add x9, x0, #127 +; NO_SVE-NEXT: ld1 { v7.b }[15], [x9] +; NO_SVE-NEXT: .LBB43_244: // %else380 +; NO_SVE-NEXT: ushll2 v16.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll v0.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v17.8h, v1.16b, #0 +; NO_SVE-NEXT: stp q0, q16, [x8] +; NO_SVE-NEXT: ushll v1.8h, v1.8b, #0 +; NO_SVE-NEXT: ushll2 v0.8h, v2.16b, #0 +; NO_SVE-NEXT: ushll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: stp q1, q17, [x8, #32] +; NO_SVE-NEXT: ushll2 v1.8h, v3.16b, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #64] +; NO_SVE-NEXT: ushll v0.8h, v3.8b, #0 +; NO_SVE-NEXT: ushll2 v2.8h, v4.16b, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #96] +; NO_SVE-NEXT: ushll v1.8h, v4.8b, #0 +; NO_SVE-NEXT: ushll2 v0.8h, v5.16b, #0 +; NO_SVE-NEXT: stp q1, q2, [x8, #128] +; NO_SVE-NEXT: ushll v2.8h, v5.8b, #0 +; NO_SVE-NEXT: ushll2 v1.8h, v6.16b, #0 +; NO_SVE-NEXT: stp q2, q0, [x8, #160] +; NO_SVE-NEXT: ushll v0.8h, v6.8b, #0 +; NO_SVE-NEXT: ushll2 v2.8h, v7.16b, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #192] +; NO_SVE-NEXT: ushll v1.8h, v7.8b, #0 +; NO_SVE-NEXT: stp q1, q2, [x8, #224] +; NO_SVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; NO_SVE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; NO_SVE-NEXT: add sp, sp, #64 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB43_245: // %cond.load340 +; NO_SVE-NEXT: add x10, x0, #114 +; NO_SVE-NEXT: ld1 { v7.b }[2], [x10] +; 
NO_SVE-NEXT: tbz x9, #51, .LBB43_231 +; NO_SVE-NEXT: .LBB43_246: // %cond.load343 +; NO_SVE-NEXT: add x10, x0, #115 +; NO_SVE-NEXT: ld1 { v7.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB43_232 +; NO_SVE-NEXT: .LBB43_247: // %cond.load346 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB43_233 +; NO_SVE-NEXT: .LBB43_248: // %cond.load349 +; NO_SVE-NEXT: add x10, x0, #117 +; NO_SVE-NEXT: ld1 { v7.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB43_234 +; NO_SVE-NEXT: .LBB43_249: // %cond.load352 +; NO_SVE-NEXT: add x10, x0, #118 +; NO_SVE-NEXT: ld1 { v7.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB43_235 +; NO_SVE-NEXT: .LBB43_250: // %cond.load355 +; NO_SVE-NEXT: add x10, x0, #119 +; NO_SVE-NEXT: ld1 { v7.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #56, .LBB43_236 +; NO_SVE-NEXT: .LBB43_251: // %cond.load358 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #57, .LBB43_237 +; NO_SVE-NEXT: .LBB43_252: // %cond.load361 +; NO_SVE-NEXT: add x10, x0, #121 +; NO_SVE-NEXT: ld1 { v7.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #58, .LBB43_238 +; NO_SVE-NEXT: .LBB43_253: // %cond.load364 +; NO_SVE-NEXT: add x10, x0, #122 +; NO_SVE-NEXT: ld1 { v7.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB43_239 +; NO_SVE-NEXT: .LBB43_254: // %cond.load367 +; NO_SVE-NEXT: add x10, x0, #123 +; NO_SVE-NEXT: ld1 { v7.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB43_240 +; NO_SVE-NEXT: .LBB43_255: // %cond.load370 +; NO_SVE-NEXT: add x10, x0, #124 +; NO_SVE-NEXT: ld1 { v7.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB43_241 +; NO_SVE-NEXT: .LBB43_256: // %cond.load373 +; NO_SVE-NEXT: add x10, x0, #125 +; NO_SVE-NEXT: ld1 { v7.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB43_242 +; NO_SVE-NEXT: .LBB43_257: // %cond.load376 +; NO_SVE-NEXT: add x10, x0, #126 +; NO_SVE-NEXT: ld1 { v7.b }[14], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB43_243 +; NO_SVE-NEXT: b .LBB43_244 +; ; VBITS_GE_2048-LABEL: masked_load_zext_v128i8i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -759,6 +11683,625 @@ } define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v64i8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q0, [x1, #32] +; NO_SVE-NEXT: cmeq v2.16b, v2.16b, #0 +; NO_SVE-NEXT: cmeq v1.16b, v0.16b, #0 +; NO_SVE-NEXT: ldp q0, q3, [x1] +; NO_SVE-NEXT: umov w9, v1.b[1] +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: umov w10, v1.b[0] +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: umov w15, v1.b[6] +; NO_SVE-NEXT: umov w16, v1.b[7] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[8] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w18, v1.b[9] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w9, v1.b[10] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[11] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[12] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[13] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; 
NO_SVE-NEXT: and w18, w18, #0x1 +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[1] +; NO_SVE-NEXT: orr w10, w10, w18, lsl #9 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #10 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: umov w16, v2.b[0] +; NO_SVE-NEXT: umov w17, v2.b[2] +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w18, v2.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #13 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[5] +; NO_SVE-NEXT: bfi w13, w10, #1, #1 +; NO_SVE-NEXT: and w10, w18, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[6] +; NO_SVE-NEXT: bfi w13, w14, #2, #1 +; NO_SVE-NEXT: umov w14, v2.b[7] +; NO_SVE-NEXT: bfi w13, w10, #3, #1 +; NO_SVE-NEXT: and w10, w15, #0x1 +; NO_SVE-NEXT: bfi w13, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v2.b[8] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v2.b[9] +; NO_SVE-NEXT: bfi w13, w10, #5, #1 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[10] +; NO_SVE-NEXT: orr w13, w13, w15, lsl #6 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w13, w10, lsl #7 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[11] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v2.b[12] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v1.b[15] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: cmeq v1.16b, v3.16b, #0 +; NO_SVE-NEXT: umov w17, v2.b[14] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #10 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[13] +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #11 +; NO_SVE-NEXT: umov w13, v1.b[2] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #15 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[4] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #1, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #13 +; NO_SVE-NEXT: bfi w15, w13, #2, #1 +; NO_SVE-NEXT: umov w14, v1.b[6] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w15, w11, #3, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[9] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: bfi w15, w13, #4, #1 +; NO_SVE-NEXT: umov w13, v1.b[8] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #14 +; NO_SVE-NEXT: bfi w15, w11, #5, #1 +; NO_SVE-NEXT: umov w18, v1.b[13] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w15, w14, lsl #6 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[10] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #8 +; NO_SVE-NEXT: umov w15, v2.b[15] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v1.b[11] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[12] +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; 
NO_SVE-NEXT: orr w11, w11, w13, lsl #10 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #15 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[1] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[2] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #11 +; NO_SVE-NEXT: umov w12, v0.b[0] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[4] +; NO_SVE-NEXT: umov w17, v0.b[5] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #16, #16 +; NO_SVE-NEXT: bfi w12, w13, #1, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[6] +; NO_SVE-NEXT: bfi w12, w15, #2, #1 +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: bfi w12, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: bfi w12, w14, #4, #1 +; NO_SVE-NEXT: umov w14, v1.b[14] +; NO_SVE-NEXT: bfi w12, w15, #5, #1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[8] +; NO_SVE-NEXT: and w17, w18, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[9] +; NO_SVE-NEXT: orr w12, w12, w15, lsl #6 +; NO_SVE-NEXT: umov w15, v0.b[10] +; NO_SVE-NEXT: orr w11, w11, w17, lsl #13 +; NO_SVE-NEXT: orr w12, w12, w13, lsl #7 +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[11] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w12, w12, w13, lsl #8 +; NO_SVE-NEXT: umov w13, v0.b[12] +; NO_SVE-NEXT: orr w11, w11, w14, lsl #14 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[13] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[14] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #10 +; NO_SVE-NEXT: umov w14, v1.b[15] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w12, w12, w16, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[15] +; NO_SVE-NEXT: orr w12, w12, w13, lsl #12 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #15 +; NO_SVE-NEXT: orr w12, w12, w15, lsl #13 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #15 +; NO_SVE-NEXT: bfi w9, w11, #16, #16 +; NO_SVE-NEXT: bfi x9, x10, #32, #32 +; NO_SVE-NEXT: tbz w9, #0, .LBB44_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB44_3 +; NO_SVE-NEXT: b .LBB44_4 +; NO_SVE-NEXT: .LBB44_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB44_4 +; NO_SVE-NEXT: .LBB44_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x10] +; NO_SVE-NEXT: .LBB44_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB44_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB44_21 +; NO_SVE-NEXT: .LBB44_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB44_22 +; NO_SVE-NEXT: .LBB44_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB44_23 +; NO_SVE-NEXT: .LBB44_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB44_24 +; NO_SVE-NEXT: .LBB44_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB44_25 +; NO_SVE-NEXT: .LBB44_10: // %else20 +; NO_SVE-NEXT: tbnz w9, #8, .LBB44_26 +; NO_SVE-NEXT: .LBB44_11: // %else23 +; NO_SVE-NEXT: tbnz w9, #9, .LBB44_27 +; NO_SVE-NEXT: .LBB44_12: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB44_28 +; NO_SVE-NEXT: .LBB44_13: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB44_29 +; NO_SVE-NEXT: .LBB44_14: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB44_30 +; NO_SVE-NEXT: 
.LBB44_15: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB44_31 +; NO_SVE-NEXT: .LBB44_16: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB44_32 +; NO_SVE-NEXT: .LBB44_17: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB44_33 +; NO_SVE-NEXT: .LBB44_18: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB44_34 +; NO_SVE-NEXT: .LBB44_19: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB44_35 +; NO_SVE-NEXT: b .LBB44_36 +; NO_SVE-NEXT: .LBB44_20: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB44_6 +; NO_SVE-NEXT: .LBB44_21: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB44_7 +; NO_SVE-NEXT: .LBB44_22: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB44_8 +; NO_SVE-NEXT: .LBB44_23: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB44_9 +; NO_SVE-NEXT: .LBB44_24: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB44_10 +; NO_SVE-NEXT: .LBB44_25: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #8, .LBB44_11 +; NO_SVE-NEXT: .LBB44_26: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #9, .LBB44_12 +; NO_SVE-NEXT: .LBB44_27: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #10, .LBB44_13 +; NO_SVE-NEXT: .LBB44_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB44_14 +; NO_SVE-NEXT: .LBB44_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB44_15 +; NO_SVE-NEXT: .LBB44_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB44_16 +; NO_SVE-NEXT: .LBB44_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB44_17 +; NO_SVE-NEXT: .LBB44_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB44_18 +; NO_SVE-NEXT: .LBB44_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB44_19 +; NO_SVE-NEXT: .LBB44_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #17, .LBB44_36 +; NO_SVE-NEXT: .LBB44_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x10] +; NO_SVE-NEXT: .LBB44_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB44_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB44_53 +; NO_SVE-NEXT: .LBB44_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB44_54 +; NO_SVE-NEXT: .LBB44_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB44_55 +; NO_SVE-NEXT: .LBB44_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB44_56 +; NO_SVE-NEXT: .LBB44_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB44_57 +; NO_SVE-NEXT: .LBB44_42: // %else68 +; NO_SVE-NEXT: tbnz w9, #24, .LBB44_58 +; NO_SVE-NEXT: .LBB44_43: // %else71 +; NO_SVE-NEXT: tbnz w9, #25, .LBB44_59 +; NO_SVE-NEXT: .LBB44_44: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB44_60 +; 
NO_SVE-NEXT: .LBB44_45: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB44_61 +; NO_SVE-NEXT: .LBB44_46: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB44_62 +; NO_SVE-NEXT: .LBB44_47: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB44_63 +; NO_SVE-NEXT: .LBB44_48: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB44_64 +; NO_SVE-NEXT: .LBB44_49: // %else89 +; NO_SVE-NEXT: tbnz w9, #31, .LBB44_65 +; NO_SVE-NEXT: .LBB44_50: // %else92 +; NO_SVE-NEXT: tbz x9, #32, .LBB44_66 +; NO_SVE-NEXT: .LBB44_51: // %cond.load94 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #33, .LBB44_67 +; NO_SVE-NEXT: b .LBB44_68 +; NO_SVE-NEXT: .LBB44_52: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB44_38 +; NO_SVE-NEXT: .LBB44_53: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB44_39 +; NO_SVE-NEXT: .LBB44_54: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB44_40 +; NO_SVE-NEXT: .LBB44_55: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB44_41 +; NO_SVE-NEXT: .LBB44_56: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB44_42 +; NO_SVE-NEXT: .LBB44_57: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB44_43 +; NO_SVE-NEXT: .LBB44_58: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB44_44 +; NO_SVE-NEXT: .LBB44_59: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB44_45 +; NO_SVE-NEXT: .LBB44_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB44_46 +; NO_SVE-NEXT: .LBB44_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB44_47 +; NO_SVE-NEXT: .LBB44_62: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB44_48 +; NO_SVE-NEXT: .LBB44_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB44_49 +; NO_SVE-NEXT: .LBB44_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB44_50 +; NO_SVE-NEXT: .LBB44_65: // %cond.load91 +; NO_SVE-NEXT: add x10, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #32, .LBB44_51 +; NO_SVE-NEXT: .LBB44_66: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz x9, #33, .LBB44_68 +; NO_SVE-NEXT: .LBB44_67: // %cond.load97 +; NO_SVE-NEXT: add x10, x0, #33 +; NO_SVE-NEXT: ld1 { v2.b }[1], [x10] +; NO_SVE-NEXT: .LBB44_68: // %else98 +; NO_SVE-NEXT: tbnz x9, #34, .LBB44_84 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x9, #35, .LBB44_85 +; NO_SVE-NEXT: .LBB44_70: // %else104 +; NO_SVE-NEXT: tbnz x9, #36, .LBB44_86 +; NO_SVE-NEXT: .LBB44_71: // %else107 +; NO_SVE-NEXT: tbnz x9, #37, .LBB44_87 +; NO_SVE-NEXT: .LBB44_72: // %else110 +; NO_SVE-NEXT: tbnz x9, #38, .LBB44_88 +; NO_SVE-NEXT: .LBB44_73: // %else113 +; NO_SVE-NEXT: tbnz x9, #39, .LBB44_89 +; NO_SVE-NEXT: .LBB44_74: // %else116 +; 
NO_SVE-NEXT: tbnz x9, #40, .LBB44_90 +; NO_SVE-NEXT: .LBB44_75: // %else119 +; NO_SVE-NEXT: tbnz x9, #41, .LBB44_91 +; NO_SVE-NEXT: .LBB44_76: // %else122 +; NO_SVE-NEXT: tbnz x9, #42, .LBB44_92 +; NO_SVE-NEXT: .LBB44_77: // %else125 +; NO_SVE-NEXT: tbnz x9, #43, .LBB44_93 +; NO_SVE-NEXT: .LBB44_78: // %else128 +; NO_SVE-NEXT: tbnz x9, #44, .LBB44_94 +; NO_SVE-NEXT: .LBB44_79: // %else131 +; NO_SVE-NEXT: tbnz x9, #45, .LBB44_95 +; NO_SVE-NEXT: .LBB44_80: // %else134 +; NO_SVE-NEXT: tbnz x9, #46, .LBB44_96 +; NO_SVE-NEXT: .LBB44_81: // %else137 +; NO_SVE-NEXT: tbnz x9, #47, .LBB44_97 +; NO_SVE-NEXT: .LBB44_82: // %else140 +; NO_SVE-NEXT: tbz x9, #48, .LBB44_98 +; NO_SVE-NEXT: .LBB44_83: // %cond.load142 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.b }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #49, .LBB44_99 +; NO_SVE-NEXT: b .LBB44_100 +; NO_SVE-NEXT: .LBB44_84: // %cond.load100 +; NO_SVE-NEXT: add x10, x0, #34 +; NO_SVE-NEXT: ld1 { v2.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB44_70 +; NO_SVE-NEXT: .LBB44_85: // %cond.load103 +; NO_SVE-NEXT: add x10, x0, #35 +; NO_SVE-NEXT: ld1 { v2.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB44_71 +; NO_SVE-NEXT: .LBB44_86: // %cond.load106 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB44_72 +; NO_SVE-NEXT: .LBB44_87: // %cond.load109 +; NO_SVE-NEXT: add x10, x0, #37 +; NO_SVE-NEXT: ld1 { v2.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB44_73 +; NO_SVE-NEXT: .LBB44_88: // %cond.load112 +; NO_SVE-NEXT: add x10, x0, #38 +; NO_SVE-NEXT: ld1 { v2.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB44_74 +; NO_SVE-NEXT: .LBB44_89: // %cond.load115 +; NO_SVE-NEXT: add x10, x0, #39 +; NO_SVE-NEXT: ld1 { v2.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #40, .LBB44_75 +; NO_SVE-NEXT: .LBB44_90: // %cond.load118 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #41, .LBB44_76 +; NO_SVE-NEXT: .LBB44_91: // %cond.load121 +; NO_SVE-NEXT: add x10, x0, #41 +; NO_SVE-NEXT: ld1 { v2.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #42, .LBB44_77 +; NO_SVE-NEXT: .LBB44_92: // %cond.load124 +; NO_SVE-NEXT: add x10, x0, #42 +; NO_SVE-NEXT: ld1 { v2.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #43, .LBB44_78 +; NO_SVE-NEXT: .LBB44_93: // %cond.load127 +; NO_SVE-NEXT: add x10, x0, #43 +; NO_SVE-NEXT: ld1 { v2.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #44, .LBB44_79 +; NO_SVE-NEXT: .LBB44_94: // %cond.load130 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #45, .LBB44_80 +; NO_SVE-NEXT: .LBB44_95: // %cond.load133 +; NO_SVE-NEXT: add x10, x0, #45 +; NO_SVE-NEXT: ld1 { v2.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB44_81 +; NO_SVE-NEXT: .LBB44_96: // %cond.load136 +; NO_SVE-NEXT: add x10, x0, #46 +; NO_SVE-NEXT: ld1 { v2.b }[14], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB44_82 +; NO_SVE-NEXT: .LBB44_97: // %cond.load139 +; NO_SVE-NEXT: add x10, x0, #47 +; NO_SVE-NEXT: ld1 { v2.b }[15], [x10] +; NO_SVE-NEXT: tbnz x9, #48, .LBB44_83 +; NO_SVE-NEXT: .LBB44_98: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz x9, #49, .LBB44_100 +; NO_SVE-NEXT: .LBB44_99: // %cond.load145 +; NO_SVE-NEXT: add x10, x0, #49 +; NO_SVE-NEXT: ld1 { v3.b }[1], [x10] +; NO_SVE-NEXT: .LBB44_100: // %else146 +; NO_SVE-NEXT: tbnz x9, #50, .LBB44_116 +; NO_SVE-NEXT: // %bb.101: // %else149 +; NO_SVE-NEXT: tbnz x9, #51, .LBB44_117 +; NO_SVE-NEXT: .LBB44_102: // %else152 +; NO_SVE-NEXT: tbnz x9, #52, .LBB44_118 +; NO_SVE-NEXT: .LBB44_103: // %else155 +; NO_SVE-NEXT: tbnz 
x9, #53, .LBB44_119 +; NO_SVE-NEXT: .LBB44_104: // %else158 +; NO_SVE-NEXT: tbnz x9, #54, .LBB44_120 +; NO_SVE-NEXT: .LBB44_105: // %else161 +; NO_SVE-NEXT: tbnz x9, #55, .LBB44_121 +; NO_SVE-NEXT: .LBB44_106: // %else164 +; NO_SVE-NEXT: tbnz x9, #56, .LBB44_122 +; NO_SVE-NEXT: .LBB44_107: // %else167 +; NO_SVE-NEXT: tbnz x9, #57, .LBB44_123 +; NO_SVE-NEXT: .LBB44_108: // %else170 +; NO_SVE-NEXT: tbnz x9, #58, .LBB44_124 +; NO_SVE-NEXT: .LBB44_109: // %else173 +; NO_SVE-NEXT: tbnz x9, #59, .LBB44_125 +; NO_SVE-NEXT: .LBB44_110: // %else176 +; NO_SVE-NEXT: tbnz x9, #60, .LBB44_126 +; NO_SVE-NEXT: .LBB44_111: // %else179 +; NO_SVE-NEXT: tbnz x9, #61, .LBB44_127 +; NO_SVE-NEXT: .LBB44_112: // %else182 +; NO_SVE-NEXT: tbnz x9, #62, .LBB44_128 +; NO_SVE-NEXT: .LBB44_113: // %else185 +; NO_SVE-NEXT: tbz x9, #63, .LBB44_115 +; NO_SVE-NEXT: .LBB44_114: // %cond.load187 +; NO_SVE-NEXT: add x9, x0, #63 +; NO_SVE-NEXT: ld1 { v3.b }[15], [x9] +; NO_SVE-NEXT: .LBB44_115: // %else188 +; NO_SVE-NEXT: ushll v6.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v0.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll2 v5.8h, v2.16b, #0 +; NO_SVE-NEXT: ushll2 v16.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v7.8h, v1.16b, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: ushll2 v0.4s, v6.8h, #0 +; NO_SVE-NEXT: ushll v6.4s, v6.4h, #0 +; NO_SVE-NEXT: ushll v1.8h, v1.8b, #0 +; NO_SVE-NEXT: stp q6, q0, [x8] +; NO_SVE-NEXT: ushll2 v0.4s, v5.8h, #0 +; NO_SVE-NEXT: ushll v5.4s, v5.4h, #0 +; NO_SVE-NEXT: ushll2 v4.8h, v3.16b, #0 +; NO_SVE-NEXT: ushll2 v6.4s, v1.8h, #0 +; NO_SVE-NEXT: stp q5, q0, [x8, #160] +; NO_SVE-NEXT: ushll v0.4s, v1.4h, #0 +; NO_SVE-NEXT: ushll2 v1.4s, v4.8h, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #64] +; NO_SVE-NEXT: ushll v0.4s, v4.4h, #0 +; NO_SVE-NEXT: ushll v2.8h, v2.8b, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #224] +; NO_SVE-NEXT: ushll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v2.4h, #0 +; NO_SVE-NEXT: ushll v2.8h, v3.8b, #0 +; NO_SVE-NEXT: ushll2 v17.4s, v7.8h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: ushll v7.4s, v7.4h, #0 +; NO_SVE-NEXT: ushll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #96] +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB44_116: // %cond.load148 +; NO_SVE-NEXT: add x10, x0, #50 +; NO_SVE-NEXT: ld1 { v3.b }[2], [x10] +; NO_SVE-NEXT: tbz x9, #51, .LBB44_102 +; NO_SVE-NEXT: .LBB44_117: // %cond.load151 +; NO_SVE-NEXT: add x10, x0, #51 +; NO_SVE-NEXT: ld1 { v3.b }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB44_103 +; NO_SVE-NEXT: .LBB44_118: // %cond.load154 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.b }[4], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB44_104 +; NO_SVE-NEXT: .LBB44_119: // %cond.load157 +; NO_SVE-NEXT: add x10, x0, #53 +; NO_SVE-NEXT: ld1 { v3.b }[5], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB44_105 +; NO_SVE-NEXT: .LBB44_120: // %cond.load160 +; NO_SVE-NEXT: add x10, x0, #54 +; NO_SVE-NEXT: ld1 { v3.b }[6], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB44_106 +; NO_SVE-NEXT: .LBB44_121: // %cond.load163 +; NO_SVE-NEXT: add x10, x0, #55 +; NO_SVE-NEXT: ld1 { v3.b }[7], [x10] +; NO_SVE-NEXT: tbz x9, #56, .LBB44_107 +; NO_SVE-NEXT: .LBB44_122: // %cond.load166 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.b }[8], [x10] +; NO_SVE-NEXT: tbz x9, #57, .LBB44_108 +; NO_SVE-NEXT: .LBB44_123: // %cond.load169 +; NO_SVE-NEXT: add x10, x0, #57 +; NO_SVE-NEXT: ld1 { v3.b }[9], [x10] +; NO_SVE-NEXT: tbz x9, #58, .LBB44_109 +; 
NO_SVE-NEXT: .LBB44_124: // %cond.load172 +; NO_SVE-NEXT: add x10, x0, #58 +; NO_SVE-NEXT: ld1 { v3.b }[10], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB44_110 +; NO_SVE-NEXT: .LBB44_125: // %cond.load175 +; NO_SVE-NEXT: add x10, x0, #59 +; NO_SVE-NEXT: ld1 { v3.b }[11], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB44_111 +; NO_SVE-NEXT: .LBB44_126: // %cond.load178 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.b }[12], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB44_112 +; NO_SVE-NEXT: .LBB44_127: // %cond.load181 +; NO_SVE-NEXT: add x10, x0, #61 +; NO_SVE-NEXT: ld1 { v3.b }[13], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB44_113 +; NO_SVE-NEXT: .LBB44_128: // %cond.load184 +; NO_SVE-NEXT: add x10, x0, #62 +; NO_SVE-NEXT: ld1 { v3.b }[14], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB44_114 +; NO_SVE-NEXT: b .LBB44_115 +; ; VBITS_GE_2048-LABEL: masked_load_zext_v64i8i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -775,6 +12318,338 @@ } define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v32i8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmeq v1.16b, v1.16b, #0 +; NO_SVE-NEXT: cmeq v0.16b, v0.16b, #0 +; NO_SVE-NEXT: umov w9, v0.b[1] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: umov w10, v0.b[0] +; NO_SVE-NEXT: umov w12, v0.b[3] +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: umov w14, v0.b[5] +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: umov w9, v0.b[8] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: umov w11, v0.b[9] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v0.b[10] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[11] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[1] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #8 +; NO_SVE-NEXT: umov w10, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v1.b[0] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #9 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w11, v1.b[3] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v1.b[4] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: umov w13, v1.b[5] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[12] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w16, w15, #1, #1 +; NO_SVE-NEXT: umov w15, v1.b[9] +; NO_SVE-NEXT: bfi w16, w10, #2, #1 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[6] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w16, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v1.b[7] +; NO_SVE-NEXT: bfi w16, w10, #4, #1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v0.b[13] +; NO_SVE-NEXT: bfi w16, w12, #5, #1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[8] +; NO_SVE-NEXT: umov w10, v0.b[14] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr 
w12, w16, w12, lsl #6 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: orr w11, w12, w11, lsl #7 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[10] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #13 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[11] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #14 +; NO_SVE-NEXT: orr w10, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w11, v1.b[12] +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v1.b[13] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v1.b[14] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v0.b[15] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v1.b[15] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w9, w12, lsl #15 +; NO_SVE-NEXT: orr w9, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: bfi w9, w12, #16, #16 +; NO_SVE-NEXT: tbz w9, #0, .LBB45_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr b0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB45_3 +; NO_SVE-NEXT: b .LBB45_4 +; NO_SVE-NEXT: .LBB45_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB45_4 +; NO_SVE-NEXT: .LBB45_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #1 +; NO_SVE-NEXT: ld1 { v0.b }[1], [x10] +; NO_SVE-NEXT: .LBB45_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB45_20 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB45_21 +; NO_SVE-NEXT: .LBB45_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB45_22 +; NO_SVE-NEXT: .LBB45_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB45_23 +; NO_SVE-NEXT: .LBB45_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB45_24 +; NO_SVE-NEXT: .LBB45_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB45_25 +; NO_SVE-NEXT: .LBB45_10: // %else20 +; NO_SVE-NEXT: tbnz w9, #8, .LBB45_26 +; NO_SVE-NEXT: .LBB45_11: // %else23 +; NO_SVE-NEXT: tbnz w9, #9, .LBB45_27 +; NO_SVE-NEXT: .LBB45_12: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB45_28 +; NO_SVE-NEXT: .LBB45_13: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB45_29 +; NO_SVE-NEXT: .LBB45_14: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB45_30 +; NO_SVE-NEXT: .LBB45_15: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB45_31 +; NO_SVE-NEXT: .LBB45_16: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB45_32 +; NO_SVE-NEXT: .LBB45_17: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB45_33 +; NO_SVE-NEXT: .LBB45_18: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB45_34 +; NO_SVE-NEXT: .LBB45_19: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.b }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB45_35 +; NO_SVE-NEXT: b .LBB45_36 +; NO_SVE-NEXT: .LBB45_20: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB45_6 +; NO_SVE-NEXT: .LBB45_21: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #3 +; NO_SVE-NEXT: ld1 { v0.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB45_7 +; NO_SVE-NEXT: .LBB45_22: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB45_8 +; NO_SVE-NEXT: .LBB45_23: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #5 +; NO_SVE-NEXT: ld1 { v0.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB45_9 +; NO_SVE-NEXT: .LBB45_24: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.b }[6], 
[x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB45_10 +; NO_SVE-NEXT: .LBB45_25: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #7 +; NO_SVE-NEXT: ld1 { v0.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #8, .LBB45_11 +; NO_SVE-NEXT: .LBB45_26: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #9, .LBB45_12 +; NO_SVE-NEXT: .LBB45_27: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #9 +; NO_SVE-NEXT: ld1 { v0.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #10, .LBB45_13 +; NO_SVE-NEXT: .LBB45_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB45_14 +; NO_SVE-NEXT: .LBB45_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #11 +; NO_SVE-NEXT: ld1 { v0.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB45_15 +; NO_SVE-NEXT: .LBB45_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB45_16 +; NO_SVE-NEXT: .LBB45_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #13 +; NO_SVE-NEXT: ld1 { v0.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB45_17 +; NO_SVE-NEXT: .LBB45_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.b }[14], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB45_18 +; NO_SVE-NEXT: .LBB45_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #15 +; NO_SVE-NEXT: ld1 { v0.b }[15], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB45_19 +; NO_SVE-NEXT: .LBB45_34: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #17, .LBB45_36 +; NO_SVE-NEXT: .LBB45_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #17 +; NO_SVE-NEXT: ld1 { v1.b }[1], [x10] +; NO_SVE-NEXT: .LBB45_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB45_52 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB45_53 +; NO_SVE-NEXT: .LBB45_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB45_54 +; NO_SVE-NEXT: .LBB45_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB45_55 +; NO_SVE-NEXT: .LBB45_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB45_56 +; NO_SVE-NEXT: .LBB45_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB45_57 +; NO_SVE-NEXT: .LBB45_42: // %else68 +; NO_SVE-NEXT: tbnz w9, #24, .LBB45_58 +; NO_SVE-NEXT: .LBB45_43: // %else71 +; NO_SVE-NEXT: tbnz w9, #25, .LBB45_59 +; NO_SVE-NEXT: .LBB45_44: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB45_60 +; NO_SVE-NEXT: .LBB45_45: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB45_61 +; NO_SVE-NEXT: .LBB45_46: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB45_62 +; NO_SVE-NEXT: .LBB45_47: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB45_63 +; NO_SVE-NEXT: .LBB45_48: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB45_64 +; NO_SVE-NEXT: .LBB45_49: // %else89 +; NO_SVE-NEXT: tbz w9, #31, .LBB45_51 +; NO_SVE-NEXT: .LBB45_50: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #31 +; NO_SVE-NEXT: ld1 { v1.b }[15], [x9] +; NO_SVE-NEXT: .LBB45_51: // %else92 +; NO_SVE-NEXT: ushll v3.8h, v0.8b, #0 +; NO_SVE-NEXT: ushll2 v0.8h, v0.16b, #0 +; NO_SVE-NEXT: ushll v2.8h, v1.8b, #0 +; NO_SVE-NEXT: ushll v6.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll2 v1.8h, v1.16b, #0 +; NO_SVE-NEXT: ushll2 v16.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll2 v5.4s, v3.8h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #96] +; NO_SVE-NEXT: ushll2 v0.2d, v6.4s, #0 +; NO_SVE-NEXT: ushll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: ushll2 v7.4s, v1.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: stp q6, q0, [x8, #64] +; NO_SVE-NEXT: ushll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: ushll 
v5.2d, v5.2s, #0 +; NO_SVE-NEXT: ushll2 v4.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll2 v6.2d, v1.4s, #0 +; NO_SVE-NEXT: stp q5, q0, [x8, #32] +; NO_SVE-NEXT: ushll v0.2d, v1.2s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #192] +; NO_SVE-NEXT: ushll v0.2d, v4.2s, #0 +; NO_SVE-NEXT: ushll v3.4s, v3.4h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #160] +; NO_SVE-NEXT: ushll2 v0.2d, v3.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v3.2s, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: ushll2 v17.2d, v7.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8] +; NO_SVE-NEXT: ushll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: ushll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #224] +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB45_52: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.b }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB45_38 +; NO_SVE-NEXT: .LBB45_53: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #19 +; NO_SVE-NEXT: ld1 { v1.b }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB45_39 +; NO_SVE-NEXT: .LBB45_54: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.b }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB45_40 +; NO_SVE-NEXT: .LBB45_55: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #21 +; NO_SVE-NEXT: ld1 { v1.b }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB45_41 +; NO_SVE-NEXT: .LBB45_56: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.b }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB45_42 +; NO_SVE-NEXT: .LBB45_57: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #23 +; NO_SVE-NEXT: ld1 { v1.b }[7], [x10] +; NO_SVE-NEXT: tbz w9, #24, .LBB45_43 +; NO_SVE-NEXT: .LBB45_58: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.b }[8], [x10] +; NO_SVE-NEXT: tbz w9, #25, .LBB45_44 +; NO_SVE-NEXT: .LBB45_59: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #25 +; NO_SVE-NEXT: ld1 { v1.b }[9], [x10] +; NO_SVE-NEXT: tbz w9, #26, .LBB45_45 +; NO_SVE-NEXT: .LBB45_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.b }[10], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB45_46 +; NO_SVE-NEXT: .LBB45_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #27 +; NO_SVE-NEXT: ld1 { v1.b }[11], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB45_47 +; NO_SVE-NEXT: .LBB45_62: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.b }[12], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB45_48 +; NO_SVE-NEXT: .LBB45_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #29 +; NO_SVE-NEXT: ld1 { v1.b }[13], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB45_49 +; NO_SVE-NEXT: .LBB45_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.b }[14], [x10] +; NO_SVE-NEXT: tbnz w9, #31, .LBB45_50 +; NO_SVE-NEXT: b .LBB45_51 +; ; VBITS_GE_2048-LABEL: masked_load_zext_v32i8i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -791,6 +12666,635 @@ } define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v64i16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q2, [x1, #96] +; NO_SVE-NEXT: cmeq v3.8h, v3.8h, #0 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: umov w9, v3.b[1] +; NO_SVE-NEXT: umov w11, v3.b[2] +; NO_SVE-NEXT: umov w10, v3.b[0] +; NO_SVE-NEXT: umov w12, v3.b[3] +; NO_SVE-NEXT: umov w13, v3.b[4] +; NO_SVE-NEXT: umov 
w14, v3.b[5] +; NO_SVE-NEXT: xtn v6.8b, v2.8h +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[6] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v3.b[7] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: umov w17, v6.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w18, v6.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w9, v6.b[2] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: umov w11, v6.b[3] +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: umov w12, v6.b[4] +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: cmeq v2.8h, v5.8h, #0 +; NO_SVE-NEXT: and w18, w18, #0x1 +; NO_SVE-NEXT: umov w13, v6.b[5] +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: xtn v5.8b, v2.8h +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w18, lsl #9 +; NO_SVE-NEXT: umov w14, v6.b[6] +; NO_SVE-NEXT: umov w15, v5.b[1] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #10 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #11 +; NO_SVE-NEXT: umov w16, v6.b[7] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #12 +; NO_SVE-NEXT: umov w12, v5.b[0] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #13 +; NO_SVE-NEXT: umov w13, v5.b[2] +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[3] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #14 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[4] +; NO_SVE-NEXT: bfi w10, w11, #1, #1 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: orr w9, w9, w16, lsl #15 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[5] +; NO_SVE-NEXT: cmeq v2.8h, v4.8h, #0 +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[6] +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: bfi w10, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v5.b[7] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v2.b[0] +; NO_SVE-NEXT: bfi w10, w11, #4, #1 +; NO_SVE-NEXT: umov w16, v2.b[6] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[1] +; NO_SVE-NEXT: ldp q7, q3, [x1, #32] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #6 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w10, w10, w13, lsl #7 +; NO_SVE-NEXT: umov w13, v2.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v2.b[3] +; NO_SVE-NEXT: umov w14, v2.b[4] +; NO_SVE-NEXT: cmeq v4.8h, v7.8h, #0 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #9 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #10 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[1] +; NO_SVE-NEXT: umov w15, v4.b[0] +; NO_SVE-NEXT: umov w17, v4.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #11 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: orr w10, w10, w12, lsl #12 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #13 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v4.b[3] +; NO_SVE-NEXT: and w12, w15, 
#0x1 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: umov w15, v4.b[4] +; NO_SVE-NEXT: bfi w12, w11, #1, #1 +; NO_SVE-NEXT: umov w11, v4.b[5] +; NO_SVE-NEXT: cmeq v3.8h, v3.8h, #0 +; NO_SVE-NEXT: umov w17, v4.b[7] +; NO_SVE-NEXT: bfi w12, w13, #2, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v4.b[6] +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w12, w13, #3, #1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #14 +; NO_SVE-NEXT: umov w13, v3.b[0] +; NO_SVE-NEXT: bfi w12, w14, #4, #1 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: bfi w12, w11, #5, #1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[1] +; NO_SVE-NEXT: orr w11, w12, w11, lsl #6 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #7 +; NO_SVE-NEXT: umov w14, v3.b[3] +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v3.b[2] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: umov w15, v2.b[7] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[4] +; NO_SVE-NEXT: umov w16, v1.b[1] +; NO_SVE-NEXT: umov w17, v1.b[2] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #15 +; NO_SVE-NEXT: umov w18, v1.b[4] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v1.b[0] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #11 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[5] +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[3] +; NO_SVE-NEXT: umov w1, v1.b[5] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #12 +; NO_SVE-NEXT: bfi w12, w15, #1, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w15, w18, #0x1 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w17, v1.b[6] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: bfi w12, w16, #2, #1 +; NO_SVE-NEXT: and w16, w1, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #13 +; NO_SVE-NEXT: bfi w12, w14, #3, #1 +; NO_SVE-NEXT: umov w14, v1.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w12, w15, #4, #1 +; NO_SVE-NEXT: umov w15, v3.b[6] +; NO_SVE-NEXT: bfi w12, w16, #5, #1 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[0] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w18, v0.b[1] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #6 +; NO_SVE-NEXT: bfi w10, w9, #16, #16 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[2] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #7 +; NO_SVE-NEXT: and w14, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[3] +; NO_SVE-NEXT: and w16, w18, #0x1 +; NO_SVE-NEXT: orr w11, w11, w13, lsl #14 +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #8 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[5] +; NO_SVE-NEXT: orr w12, w12, w16, lsl #9 +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v0.b[6] +; NO_SVE-NEXT: orr w12, w12, w14, lsl #10 +; NO_SVE-NEXT: umov w14, v3.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w12, w12, w16, lsl #11 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: orr w12, w12, w13, lsl #12 +; NO_SVE-NEXT: and w13, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #15 +; NO_SVE-NEXT: orr w12, w12, w15, lsl #13 +; NO_SVE-NEXT: orr w9, w12, w13, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w16, lsl 
#15 +; NO_SVE-NEXT: bfi w9, w11, #16, #16 +; NO_SVE-NEXT: bfi x9, x10, #32, #32 +; NO_SVE-NEXT: tbz w9, #0, .LBB46_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB46_3 +; NO_SVE-NEXT: b .LBB46_4 +; NO_SVE-NEXT: .LBB46_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB46_4 +; NO_SVE-NEXT: .LBB46_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB46_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB46_13 +; NO_SVE-NEXT: .LBB46_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB46_14 +; NO_SVE-NEXT: .LBB46_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB46_15 +; NO_SVE-NEXT: .LBB46_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB46_16 +; NO_SVE-NEXT: .LBB46_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB46_17 +; NO_SVE-NEXT: .LBB46_10: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB46_18 +; NO_SVE-NEXT: .LBB46_11: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #9, .LBB46_19 +; NO_SVE-NEXT: b .LBB46_20 +; NO_SVE-NEXT: .LBB46_12: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB46_6 +; NO_SVE-NEXT: .LBB46_13: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB46_7 +; NO_SVE-NEXT: .LBB46_14: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB46_8 +; NO_SVE-NEXT: .LBB46_15: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB46_9 +; NO_SVE-NEXT: .LBB46_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB46_10 +; NO_SVE-NEXT: .LBB46_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB46_11 +; NO_SVE-NEXT: .LBB46_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #9, .LBB46_20 +; NO_SVE-NEXT: .LBB46_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB46_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB46_29 +; NO_SVE-NEXT: .LBB46_22: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB46_30 +; NO_SVE-NEXT: .LBB46_23: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB46_31 +; NO_SVE-NEXT: .LBB46_24: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB46_32 +; NO_SVE-NEXT: .LBB46_25: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB46_33 +; NO_SVE-NEXT: .LBB46_26: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB46_34 +; NO_SVE-NEXT: .LBB46_27: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB46_35 +; NO_SVE-NEXT: b .LBB46_36 +; NO_SVE-NEXT: .LBB46_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB46_22 +; NO_SVE-NEXT: .LBB46_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB46_23 +; NO_SVE-NEXT: .LBB46_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB46_24 +; NO_SVE-NEXT: .LBB46_31: // %cond.load37 +; NO_SVE-NEXT: add 
x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB46_25 +; NO_SVE-NEXT: .LBB46_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB46_26 +; NO_SVE-NEXT: .LBB46_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB46_27 +; NO_SVE-NEXT: .LBB46_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w9, #17, .LBB46_36 +; NO_SVE-NEXT: .LBB46_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #34 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB46_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB46_45 +; NO_SVE-NEXT: .LBB46_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB46_46 +; NO_SVE-NEXT: .LBB46_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB46_47 +; NO_SVE-NEXT: .LBB46_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB46_48 +; NO_SVE-NEXT: .LBB46_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB46_49 +; NO_SVE-NEXT: .LBB46_42: // %else68 +; NO_SVE-NEXT: tbz w9, #24, .LBB46_50 +; NO_SVE-NEXT: .LBB46_43: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #25, .LBB46_51 +; NO_SVE-NEXT: b .LBB46_52 +; NO_SVE-NEXT: .LBB46_44: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB46_38 +; NO_SVE-NEXT: .LBB46_45: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #38 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB46_39 +; NO_SVE-NEXT: .LBB46_46: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB46_40 +; NO_SVE-NEXT: .LBB46_47: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #42 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB46_41 +; NO_SVE-NEXT: .LBB46_48: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB46_42 +; NO_SVE-NEXT: .LBB46_49: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #46 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #24, .LBB46_43 +; NO_SVE-NEXT: .LBB46_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w9, #25, .LBB46_52 +; NO_SVE-NEXT: .LBB46_51: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #50 +; NO_SVE-NEXT: ld1 { v3.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_52: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB46_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB46_61 +; NO_SVE-NEXT: .LBB46_54: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB46_62 +; NO_SVE-NEXT: .LBB46_55: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB46_63 +; NO_SVE-NEXT: .LBB46_56: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB46_64 +; NO_SVE-NEXT: .LBB46_57: // %else89 +; NO_SVE-NEXT: tbnz w9, #31, .LBB46_65 +; NO_SVE-NEXT: .LBB46_58: // %else92 +; NO_SVE-NEXT: tbz x9, #32, .LBB46_66 +; NO_SVE-NEXT: .LBB46_59: // %cond.load94 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #33, .LBB46_67 +; NO_SVE-NEXT: b .LBB46_68 +; NO_SVE-NEXT: .LBB46_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB46_54 +; NO_SVE-NEXT: .LBB46_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #54 +; NO_SVE-NEXT: ld1 { v3.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB46_55 +; NO_SVE-NEXT: .LBB46_62: // 
%cond.load82 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB46_56 +; NO_SVE-NEXT: .LBB46_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #58 +; NO_SVE-NEXT: ld1 { v3.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB46_57 +; NO_SVE-NEXT: .LBB46_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #31, .LBB46_58 +; NO_SVE-NEXT: .LBB46_65: // %cond.load91 +; NO_SVE-NEXT: add x10, x0, #62 +; NO_SVE-NEXT: ld1 { v3.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #32, .LBB46_59 +; NO_SVE-NEXT: .LBB46_66: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz x9, #33, .LBB46_68 +; NO_SVE-NEXT: .LBB46_67: // %cond.load97 +; NO_SVE-NEXT: add x10, x0, #66 +; NO_SVE-NEXT: ld1 { v4.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_68: // %else98 +; NO_SVE-NEXT: tbnz x9, #34, .LBB46_76 +; NO_SVE-NEXT: // %bb.69: // %else101 +; NO_SVE-NEXT: tbnz x9, #35, .LBB46_77 +; NO_SVE-NEXT: .LBB46_70: // %else104 +; NO_SVE-NEXT: tbnz x9, #36, .LBB46_78 +; NO_SVE-NEXT: .LBB46_71: // %else107 +; NO_SVE-NEXT: tbnz x9, #37, .LBB46_79 +; NO_SVE-NEXT: .LBB46_72: // %else110 +; NO_SVE-NEXT: tbnz x9, #38, .LBB46_80 +; NO_SVE-NEXT: .LBB46_73: // %else113 +; NO_SVE-NEXT: tbnz x9, #39, .LBB46_81 +; NO_SVE-NEXT: .LBB46_74: // %else116 +; NO_SVE-NEXT: tbz x9, #40, .LBB46_82 +; NO_SVE-NEXT: .LBB46_75: // %cond.load118 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #41, .LBB46_83 +; NO_SVE-NEXT: b .LBB46_84 +; NO_SVE-NEXT: .LBB46_76: // %cond.load100 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #35, .LBB46_70 +; NO_SVE-NEXT: .LBB46_77: // %cond.load103 +; NO_SVE-NEXT: add x10, x0, #70 +; NO_SVE-NEXT: ld1 { v4.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #36, .LBB46_71 +; NO_SVE-NEXT: .LBB46_78: // %cond.load106 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #37, .LBB46_72 +; NO_SVE-NEXT: .LBB46_79: // %cond.load109 +; NO_SVE-NEXT: add x10, x0, #74 +; NO_SVE-NEXT: ld1 { v4.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #38, .LBB46_73 +; NO_SVE-NEXT: .LBB46_80: // %cond.load112 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.h }[6], [x10] +; NO_SVE-NEXT: tbz x9, #39, .LBB46_74 +; NO_SVE-NEXT: .LBB46_81: // %cond.load115 +; NO_SVE-NEXT: add x10, x0, #78 +; NO_SVE-NEXT: ld1 { v4.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #40, .LBB46_75 +; NO_SVE-NEXT: .LBB46_82: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz x9, #41, .LBB46_84 +; NO_SVE-NEXT: .LBB46_83: // %cond.load121 +; NO_SVE-NEXT: add x10, x0, #82 +; NO_SVE-NEXT: ld1 { v5.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_84: // %else122 +; NO_SVE-NEXT: tbnz x9, #42, .LBB46_92 +; NO_SVE-NEXT: // %bb.85: // %else125 +; NO_SVE-NEXT: tbnz x9, #43, .LBB46_93 +; NO_SVE-NEXT: .LBB46_86: // %else128 +; NO_SVE-NEXT: tbnz x9, #44, .LBB46_94 +; NO_SVE-NEXT: .LBB46_87: // %else131 +; NO_SVE-NEXT: tbnz x9, #45, .LBB46_95 +; NO_SVE-NEXT: .LBB46_88: // %else134 +; NO_SVE-NEXT: tbnz x9, #46, .LBB46_96 +; NO_SVE-NEXT: .LBB46_89: // %else137 +; NO_SVE-NEXT: tbnz x9, #47, .LBB46_97 +; NO_SVE-NEXT: .LBB46_90: // %else140 +; NO_SVE-NEXT: tbz x9, #48, .LBB46_98 +; NO_SVE-NEXT: .LBB46_91: // %cond.load142 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #49, .LBB46_99 +; NO_SVE-NEXT: b .LBB46_100 +; NO_SVE-NEXT: .LBB46_92: // %cond.load124 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.h }[2], [x10] +; 
NO_SVE-NEXT: tbz x9, #43, .LBB46_86 +; NO_SVE-NEXT: .LBB46_93: // %cond.load127 +; NO_SVE-NEXT: add x10, x0, #86 +; NO_SVE-NEXT: ld1 { v5.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #44, .LBB46_87 +; NO_SVE-NEXT: .LBB46_94: // %cond.load130 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #45, .LBB46_88 +; NO_SVE-NEXT: .LBB46_95: // %cond.load133 +; NO_SVE-NEXT: add x10, x0, #90 +; NO_SVE-NEXT: ld1 { v5.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #46, .LBB46_89 +; NO_SVE-NEXT: .LBB46_96: // %cond.load136 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.h }[6], [x10] +; NO_SVE-NEXT: tbz x9, #47, .LBB46_90 +; NO_SVE-NEXT: .LBB46_97: // %cond.load139 +; NO_SVE-NEXT: add x10, x0, #94 +; NO_SVE-NEXT: ld1 { v5.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #48, .LBB46_91 +; NO_SVE-NEXT: .LBB46_98: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz x9, #49, .LBB46_100 +; NO_SVE-NEXT: .LBB46_99: // %cond.load145 +; NO_SVE-NEXT: add x10, x0, #98 +; NO_SVE-NEXT: ld1 { v6.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_100: // %else146 +; NO_SVE-NEXT: tbnz x9, #50, .LBB46_108 +; NO_SVE-NEXT: // %bb.101: // %else149 +; NO_SVE-NEXT: tbnz x9, #51, .LBB46_109 +; NO_SVE-NEXT: .LBB46_102: // %else152 +; NO_SVE-NEXT: tbnz x9, #52, .LBB46_110 +; NO_SVE-NEXT: .LBB46_103: // %else155 +; NO_SVE-NEXT: tbnz x9, #53, .LBB46_111 +; NO_SVE-NEXT: .LBB46_104: // %else158 +; NO_SVE-NEXT: tbnz x9, #54, .LBB46_112 +; NO_SVE-NEXT: .LBB46_105: // %else161 +; NO_SVE-NEXT: tbnz x9, #55, .LBB46_113 +; NO_SVE-NEXT: .LBB46_106: // %else164 +; NO_SVE-NEXT: tbz x9, #56, .LBB46_114 +; NO_SVE-NEXT: .LBB46_107: // %cond.load166 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.h }[0], [x10] +; NO_SVE-NEXT: tbnz x9, #57, .LBB46_115 +; NO_SVE-NEXT: b .LBB46_116 +; NO_SVE-NEXT: .LBB46_108: // %cond.load148 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #51, .LBB46_102 +; NO_SVE-NEXT: .LBB46_109: // %cond.load151 +; NO_SVE-NEXT: add x10, x0, #102 +; NO_SVE-NEXT: ld1 { v6.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #52, .LBB46_103 +; NO_SVE-NEXT: .LBB46_110: // %cond.load154 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #53, .LBB46_104 +; NO_SVE-NEXT: .LBB46_111: // %cond.load157 +; NO_SVE-NEXT: add x10, x0, #106 +; NO_SVE-NEXT: ld1 { v6.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #54, .LBB46_105 +; NO_SVE-NEXT: .LBB46_112: // %cond.load160 +; NO_SVE-NEXT: add x10, x0, #108 +; NO_SVE-NEXT: ld1 { v6.h }[6], [x10] +; NO_SVE-NEXT: tbz x9, #55, .LBB46_106 +; NO_SVE-NEXT: .LBB46_113: // %cond.load163 +; NO_SVE-NEXT: add x10, x0, #110 +; NO_SVE-NEXT: ld1 { v6.h }[7], [x10] +; NO_SVE-NEXT: tbnz x9, #56, .LBB46_107 +; NO_SVE-NEXT: .LBB46_114: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz x9, #57, .LBB46_116 +; NO_SVE-NEXT: .LBB46_115: // %cond.load169 +; NO_SVE-NEXT: add x10, x0, #114 +; NO_SVE-NEXT: ld1 { v7.h }[1], [x10] +; NO_SVE-NEXT: .LBB46_116: // %else170 +; NO_SVE-NEXT: tbnz x9, #58, .LBB46_124 +; NO_SVE-NEXT: // %bb.117: // %else173 +; NO_SVE-NEXT: tbnz x9, #59, .LBB46_125 +; NO_SVE-NEXT: .LBB46_118: // %else176 +; NO_SVE-NEXT: tbnz x9, #60, .LBB46_126 +; NO_SVE-NEXT: .LBB46_119: // %else179 +; NO_SVE-NEXT: tbnz x9, #61, .LBB46_127 +; NO_SVE-NEXT: .LBB46_120: // %else182 +; NO_SVE-NEXT: tbnz x9, #62, .LBB46_128 +; NO_SVE-NEXT: .LBB46_121: // %else185 +; NO_SVE-NEXT: tbz x9, #63, .LBB46_123 +; NO_SVE-NEXT: .LBB46_122: // %cond.load187 +; NO_SVE-NEXT: add x9, x0, #126 +; NO_SVE-NEXT: ld1 { v7.h }[7], 
[x9] +; NO_SVE-NEXT: .LBB46_123: // %else188 +; NO_SVE-NEXT: ushll2 v16.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v0.4h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8] +; NO_SVE-NEXT: ushll2 v16.4s, v1.8h, #0 +; NO_SVE-NEXT: ushll v0.4s, v1.4h, #0 +; NO_SVE-NEXT: ushll v1.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: ushll2 v0.4s, v2.8h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #64] +; NO_SVE-NEXT: ushll2 v0.4s, v3.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v3.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #96] +; NO_SVE-NEXT: ushll2 v0.4s, v4.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v4.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: ushll2 v0.4s, v5.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v5.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #160] +; NO_SVE-NEXT: ushll2 v0.4s, v6.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v6.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: ushll2 v0.4s, v7.8h, #0 +; NO_SVE-NEXT: ushll v1.4s, v7.4h, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB46_124: // %cond.load172 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.h }[2], [x10] +; NO_SVE-NEXT: tbz x9, #59, .LBB46_118 +; NO_SVE-NEXT: .LBB46_125: // %cond.load175 +; NO_SVE-NEXT: add x10, x0, #118 +; NO_SVE-NEXT: ld1 { v7.h }[3], [x10] +; NO_SVE-NEXT: tbz x9, #60, .LBB46_119 +; NO_SVE-NEXT: .LBB46_126: // %cond.load178 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.h }[4], [x10] +; NO_SVE-NEXT: tbz x9, #61, .LBB46_120 +; NO_SVE-NEXT: .LBB46_127: // %cond.load181 +; NO_SVE-NEXT: add x10, x0, #122 +; NO_SVE-NEXT: ld1 { v7.h }[5], [x10] +; NO_SVE-NEXT: tbz x9, #62, .LBB46_121 +; NO_SVE-NEXT: .LBB46_128: // %cond.load184 +; NO_SVE-NEXT: add x10, x0, #124 +; NO_SVE-NEXT: ld1 { v7.h }[6], [x10] +; NO_SVE-NEXT: tbnz x9, #63, .LBB46_122 +; NO_SVE-NEXT: b .LBB46_123 +; ; VBITS_GE_2048-LABEL: masked_load_zext_v64i16i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -807,6 +13311,343 @@ } define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v32i16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x1, #32] +; NO_SVE-NEXT: cmeq v0.8h, v0.8h, #0 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: umov w9, v0.b[1] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: umov w10, v0.b[0] +; NO_SVE-NEXT: umov w12, v0.b[3] +; NO_SVE-NEXT: umov w13, v0.b[4] +; NO_SVE-NEXT: umov w14, v0.b[5] +; NO_SVE-NEXT: cmeq v1.8h, v1.8h, #0 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: xtn v1.8b, v1.8h +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[7] +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: cmeq v2.8h, v2.8h, #0 +; NO_SVE-NEXT: umov w17, v1.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w9, v1.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w11, v1.b[2] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w12, v1.b[3] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: umov w15, v2.b[1] +; NO_SVE-NEXT: and w17, w17, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: umov w16, 
v2.b[2] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w17, lsl #8 +; NO_SVE-NEXT: umov w17, v2.b[0] +; NO_SVE-NEXT: umov w13, v1.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w9, w10, w9, lsl #9 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w14, v1.b[5] +; NO_SVE-NEXT: and w10, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w12, lsl #11 +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: and w16, w17, #0x1 +; NO_SVE-NEXT: umov w17, v2.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w16, w15, #1, #1 +; NO_SVE-NEXT: bfi w16, w10, #2, #1 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #12 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: cmeq v0.8h, v3.8h, #0 +; NO_SVE-NEXT: and w12, w17, #0x1 +; NO_SVE-NEXT: bfi w16, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v2.b[7] +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: bfi w16, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v1.b[6] +; NO_SVE-NEXT: bfi w16, w12, #5, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[0] +; NO_SVE-NEXT: umov w15, v0.b[1] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #13 +; NO_SVE-NEXT: orr w12, w16, w12, lsl #6 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w12, w10, lsl #7 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v0.b[4] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v1.b[7] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[7] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w9, w12, lsl #15 +; NO_SVE-NEXT: orr w9, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: bfi w9, w12, #16, #16 +; NO_SVE-NEXT: tbz w9, #0, .LBB47_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr h0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB47_3 +; NO_SVE-NEXT: b .LBB47_4 +; NO_SVE-NEXT: .LBB47_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB47_4 +; NO_SVE-NEXT: .LBB47_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #2 +; NO_SVE-NEXT: ld1 { v0.h }[1], [x10] +; NO_SVE-NEXT: .LBB47_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB47_12 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB47_13 +; NO_SVE-NEXT: .LBB47_6: // %else8 +; NO_SVE-NEXT: tbnz w9, #4, .LBB47_14 +; NO_SVE-NEXT: .LBB47_7: // %else11 +; NO_SVE-NEXT: tbnz w9, #5, .LBB47_15 +; NO_SVE-NEXT: .LBB47_8: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB47_16 +; NO_SVE-NEXT: .LBB47_9: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB47_17 +; NO_SVE-NEXT: .LBB47_10: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB47_18 +; NO_SVE-NEXT: .LBB47_11: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #9, .LBB47_19 +; NO_SVE-NEXT: b .LBB47_20 +; NO_SVE-NEXT: .LBB47_12: // %cond.load4 +; NO_SVE-NEXT: add 
x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB47_6 +; NO_SVE-NEXT: .LBB47_13: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #6 +; NO_SVE-NEXT: ld1 { v0.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #4, .LBB47_7 +; NO_SVE-NEXT: .LBB47_14: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #5, .LBB47_8 +; NO_SVE-NEXT: .LBB47_15: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #10 +; NO_SVE-NEXT: ld1 { v0.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #6, .LBB47_9 +; NO_SVE-NEXT: .LBB47_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB47_10 +; NO_SVE-NEXT: .LBB47_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #14 +; NO_SVE-NEXT: ld1 { v0.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB47_11 +; NO_SVE-NEXT: .LBB47_18: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #9, .LBB47_20 +; NO_SVE-NEXT: .LBB47_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, #18 +; NO_SVE-NEXT: ld1 { v1.h }[1], [x10] +; NO_SVE-NEXT: .LBB47_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB47_28 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB47_29 +; NO_SVE-NEXT: .LBB47_22: // %else32 +; NO_SVE-NEXT: tbnz w9, #12, .LBB47_30 +; NO_SVE-NEXT: .LBB47_23: // %else35 +; NO_SVE-NEXT: tbnz w9, #13, .LBB47_31 +; NO_SVE-NEXT: .LBB47_24: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB47_32 +; NO_SVE-NEXT: .LBB47_25: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB47_33 +; NO_SVE-NEXT: .LBB47_26: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB47_34 +; NO_SVE-NEXT: .LBB47_27: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB47_35 +; NO_SVE-NEXT: b .LBB47_36 +; NO_SVE-NEXT: .LBB47_28: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB47_22 +; NO_SVE-NEXT: .LBB47_29: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #22 +; NO_SVE-NEXT: ld1 { v1.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #12, .LBB47_23 +; NO_SVE-NEXT: .LBB47_30: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #13, .LBB47_24 +; NO_SVE-NEXT: .LBB47_31: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #26 +; NO_SVE-NEXT: ld1 { v1.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #14, .LBB47_25 +; NO_SVE-NEXT: .LBB47_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB47_26 +; NO_SVE-NEXT: .LBB47_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #30 +; NO_SVE-NEXT: ld1 { v1.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB47_27 +; NO_SVE-NEXT: .LBB47_34: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w9, #17, .LBB47_36 +; NO_SVE-NEXT: .LBB47_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #34 +; NO_SVE-NEXT: ld1 { v2.h }[1], [x10] +; NO_SVE-NEXT: .LBB47_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB47_44 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB47_45 +; NO_SVE-NEXT: .LBB47_38: // %else56 +; NO_SVE-NEXT: tbnz w9, #20, .LBB47_46 +; NO_SVE-NEXT: .LBB47_39: // %else59 +; NO_SVE-NEXT: tbnz w9, #21, .LBB47_47 +; NO_SVE-NEXT: .LBB47_40: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB47_48 +; NO_SVE-NEXT: .LBB47_41: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB47_49 +; NO_SVE-NEXT: .LBB47_42: // %else68 +; NO_SVE-NEXT: tbz w9, #24, .LBB47_50 +; NO_SVE-NEXT: .LBB47_43: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, 
#48 +; NO_SVE-NEXT: ld1 { v3.h }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #25, .LBB47_51 +; NO_SVE-NEXT: b .LBB47_52 +; NO_SVE-NEXT: .LBB47_44: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #36 +; NO_SVE-NEXT: ld1 { v2.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB47_38 +; NO_SVE-NEXT: .LBB47_45: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #38 +; NO_SVE-NEXT: ld1 { v2.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #20, .LBB47_39 +; NO_SVE-NEXT: .LBB47_46: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #21, .LBB47_40 +; NO_SVE-NEXT: .LBB47_47: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #42 +; NO_SVE-NEXT: ld1 { v2.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #22, .LBB47_41 +; NO_SVE-NEXT: .LBB47_48: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.h }[6], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB47_42 +; NO_SVE-NEXT: .LBB47_49: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #46 +; NO_SVE-NEXT: ld1 { v2.h }[7], [x10] +; NO_SVE-NEXT: tbnz w9, #24, .LBB47_43 +; NO_SVE-NEXT: .LBB47_50: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w9, #25, .LBB47_52 +; NO_SVE-NEXT: .LBB47_51: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #50 +; NO_SVE-NEXT: ld1 { v3.h }[1], [x10] +; NO_SVE-NEXT: .LBB47_52: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB47_60 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB47_61 +; NO_SVE-NEXT: .LBB47_54: // %else80 +; NO_SVE-NEXT: tbnz w9, #28, .LBB47_62 +; NO_SVE-NEXT: .LBB47_55: // %else83 +; NO_SVE-NEXT: tbnz w9, #29, .LBB47_63 +; NO_SVE-NEXT: .LBB47_56: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB47_64 +; NO_SVE-NEXT: .LBB47_57: // %else89 +; NO_SVE-NEXT: tbz w9, #31, .LBB47_59 +; NO_SVE-NEXT: .LBB47_58: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #62 +; NO_SVE-NEXT: ld1 { v3.h }[7], [x9] +; NO_SVE-NEXT: .LBB47_59: // %else92 +; NO_SVE-NEXT: ushll v6.4s, v0.4h, #0 +; NO_SVE-NEXT: ushll2 v0.4s, v0.8h, #0 +; NO_SVE-NEXT: ushll2 v5.4s, v2.8h, #0 +; NO_SVE-NEXT: ushll2 v16.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll2 v7.4s, v1.8h, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: ushll2 v0.2d, v6.4s, #0 +; NO_SVE-NEXT: ushll v6.2d, v6.2s, #0 +; NO_SVE-NEXT: ushll v1.4s, v1.4h, #0 +; NO_SVE-NEXT: stp q6, q0, [x8] +; NO_SVE-NEXT: ushll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: ushll v5.2d, v5.2s, #0 +; NO_SVE-NEXT: ushll2 v4.4s, v3.8h, #0 +; NO_SVE-NEXT: ushll2 v6.2d, v1.4s, #0 +; NO_SVE-NEXT: stp q5, q0, [x8, #160] +; NO_SVE-NEXT: ushll v0.2d, v1.2s, #0 +; NO_SVE-NEXT: ushll2 v1.2d, v4.4s, #0 +; NO_SVE-NEXT: stp q0, q6, [x8, #64] +; NO_SVE-NEXT: ushll v0.2d, v4.2s, #0 +; NO_SVE-NEXT: ushll v2.4s, v2.4h, #0 +; NO_SVE-NEXT: stp q0, q1, [x8, #224] +; NO_SVE-NEXT: ushll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: ushll v2.4s, v3.4h, #0 +; NO_SVE-NEXT: ushll2 v17.2d, v7.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: ushll v7.2d, v7.2s, #0 +; NO_SVE-NEXT: ushll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q7, q17, [x8, #96] +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB47_60: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.h }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB47_54 +; NO_SVE-NEXT: .LBB47_61: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #54 +; NO_SVE-NEXT: ld1 { v3.h }[3], [x10] +; NO_SVE-NEXT: tbz w9, #28, .LBB47_55 +; NO_SVE-NEXT: .LBB47_62: // %cond.load82 +; NO_SVE-NEXT: add x10, 
x0, #56 +; NO_SVE-NEXT: ld1 { v3.h }[4], [x10] +; NO_SVE-NEXT: tbz w9, #29, .LBB47_56 +; NO_SVE-NEXT: .LBB47_63: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #58 +; NO_SVE-NEXT: ld1 { v3.h }[5], [x10] +; NO_SVE-NEXT: tbz w9, #30, .LBB47_57 +; NO_SVE-NEXT: .LBB47_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.h }[6], [x10] +; NO_SVE-NEXT: tbnz w9, #31, .LBB47_58 +; NO_SVE-NEXT: b .LBB47_59 +; ; VBITS_GE_2048-LABEL: masked_load_zext_v32i16i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -823,6 +13664,349 @@ } define <32 x i64> @masked_load_zext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_v32i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q3, [x1, #64] +; NO_SVE-NEXT: cmeq v2.4s, v2.4s, #0 +; NO_SVE-NEXT: cmeq v3.4s, v3.4s, #0 +; NO_SVE-NEXT: ldp q4, q5, [x1, #96] +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: cmeq v4.4s, v4.4s, #0 +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: cmeq v5.4s, v5.4s, #0 +; NO_SVE-NEXT: umov w9, v2.b[1] +; NO_SVE-NEXT: ldp q0, q1, [x1] +; NO_SVE-NEXT: umov w11, v2.b[2] +; NO_SVE-NEXT: umov w10, v2.b[0] +; NO_SVE-NEXT: uzp1 v3.8h, v4.8h, v5.8h +; NO_SVE-NEXT: umov w12, v2.b[3] +; NO_SVE-NEXT: umov w13, v2.b[4] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: umov w14, v2.b[5] +; NO_SVE-NEXT: umov w15, v2.b[6] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: umov w16, v2.b[7] +; NO_SVE-NEXT: cmeq v0.4s, v0.4s, #0 +; NO_SVE-NEXT: bfi w10, w9, #1, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w9, v3.b[0] +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: umov w11, v3.b[1] +; NO_SVE-NEXT: bfi w10, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v3.b[2] +; NO_SVE-NEXT: bfi w10, w13, #4, #1 +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: umov w13, v3.b[3] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: umov w14, v3.b[4] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w10, w10, w15, lsl #6 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: orr w10, w10, w16, lsl #7 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: ldp q1, q4, [x1, #32] +; NO_SVE-NEXT: orr w9, w10, w9, lsl #8 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[1] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #9 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v3.b[5] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #10 +; NO_SVE-NEXT: umov w10, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[2] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #12 +; NO_SVE-NEXT: umov w14, v0.b[3] +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[4] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: cmeq v2.4s, v4.4s, #0 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: cmeq v1.4s, v1.4s, #0 +; NO_SVE-NEXT: bfi w10, w13, #1, #1 +; NO_SVE-NEXT: umov w16, v0.b[5] +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: bfi w10, w13, #3, #1 +; NO_SVE-NEXT: umov w13, v0.b[7] +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: orr w9, 
w9, w12, lsl #13 +; NO_SVE-NEXT: xtn v0.8b, v1.8h +; NO_SVE-NEXT: bfi w10, w11, #4, #1 +; NO_SVE-NEXT: umov w11, v3.b[6] +; NO_SVE-NEXT: bfi w10, w14, #5, #1 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[0] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w16, v0.b[1] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #6 +; NO_SVE-NEXT: orr w10, w10, w13, lsl #7 +; NO_SVE-NEXT: umov w13, v0.b[2] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[3] +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: umov w11, v0.b[4] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #8 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: orr w10, w10, w14, lsl #9 +; NO_SVE-NEXT: and w14, w15, #0x1 +; NO_SVE-NEXT: umov w15, v0.b[6] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v3.b[7] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #11 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w14, v0.b[7] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w15, #0x1 +; NO_SVE-NEXT: orr w12, w9, w12, lsl #15 +; NO_SVE-NEXT: orr w9, w10, w13, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: bfi w9, w12, #16, #16 +; NO_SVE-NEXT: tbz w9, #0, .LBB48_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w9, #1, .LBB48_3 +; NO_SVE-NEXT: b .LBB48_4 +; NO_SVE-NEXT: .LBB48_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w9, #1, .LBB48_4 +; NO_SVE-NEXT: .LBB48_3: // %cond.load1 +; NO_SVE-NEXT: add x10, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_4: // %else2 +; NO_SVE-NEXT: tbnz w9, #2, .LBB48_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w9, #3, .LBB48_9 +; NO_SVE-NEXT: .LBB48_6: // %else8 +; NO_SVE-NEXT: tbz w9, #4, .LBB48_10 +; NO_SVE-NEXT: .LBB48_7: // %cond.load10 +; NO_SVE-NEXT: add x10, x0, #16 +; NO_SVE-NEXT: ld1 { v1.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #5, .LBB48_11 +; NO_SVE-NEXT: b .LBB48_12 +; NO_SVE-NEXT: .LBB48_8: // %cond.load4 +; NO_SVE-NEXT: add x10, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #3, .LBB48_6 +; NO_SVE-NEXT: .LBB48_9: // %cond.load7 +; NO_SVE-NEXT: add x10, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #4, .LBB48_7 +; NO_SVE-NEXT: .LBB48_10: +; NO_SVE-NEXT: // implicit-def: $q1 +; NO_SVE-NEXT: tbz w9, #5, .LBB48_12 +; NO_SVE-NEXT: .LBB48_11: // %cond.load13 +; NO_SVE-NEXT: add x10, x0, #20 +; NO_SVE-NEXT: ld1 { v1.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_12: // %else14 +; NO_SVE-NEXT: tbnz w9, #6, .LBB48_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbnz w9, #7, .LBB48_17 +; NO_SVE-NEXT: .LBB48_14: // %else20 +; NO_SVE-NEXT: tbz w9, #8, .LBB48_18 +; NO_SVE-NEXT: .LBB48_15: // %cond.load22 +; NO_SVE-NEXT: add x10, x0, #32 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #9, .LBB48_19 +; NO_SVE-NEXT: b .LBB48_20 +; NO_SVE-NEXT: .LBB48_16: // %cond.load16 +; NO_SVE-NEXT: add x10, x0, #24 +; NO_SVE-NEXT: ld1 { v1.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #7, .LBB48_14 +; NO_SVE-NEXT: .LBB48_17: // %cond.load19 +; NO_SVE-NEXT: add x10, x0, #28 +; NO_SVE-NEXT: ld1 { v1.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #8, .LBB48_15 +; NO_SVE-NEXT: .LBB48_18: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w9, #9, .LBB48_20 +; NO_SVE-NEXT: .LBB48_19: // %cond.load25 +; NO_SVE-NEXT: add x10, x0, 
#36 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_20: // %else26 +; NO_SVE-NEXT: tbnz w9, #10, .LBB48_24 +; NO_SVE-NEXT: // %bb.21: // %else29 +; NO_SVE-NEXT: tbnz w9, #11, .LBB48_25 +; NO_SVE-NEXT: .LBB48_22: // %else32 +; NO_SVE-NEXT: tbz w9, #12, .LBB48_26 +; NO_SVE-NEXT: .LBB48_23: // %cond.load34 +; NO_SVE-NEXT: add x10, x0, #48 +; NO_SVE-NEXT: ld1 { v3.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #13, .LBB48_27 +; NO_SVE-NEXT: b .LBB48_28 +; NO_SVE-NEXT: .LBB48_24: // %cond.load28 +; NO_SVE-NEXT: add x10, x0, #40 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #11, .LBB48_22 +; NO_SVE-NEXT: .LBB48_25: // %cond.load31 +; NO_SVE-NEXT: add x10, x0, #44 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #12, .LBB48_23 +; NO_SVE-NEXT: .LBB48_26: +; NO_SVE-NEXT: // implicit-def: $q3 +; NO_SVE-NEXT: tbz w9, #13, .LBB48_28 +; NO_SVE-NEXT: .LBB48_27: // %cond.load37 +; NO_SVE-NEXT: add x10, x0, #52 +; NO_SVE-NEXT: ld1 { v3.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_28: // %else38 +; NO_SVE-NEXT: tbnz w9, #14, .LBB48_32 +; NO_SVE-NEXT: // %bb.29: // %else41 +; NO_SVE-NEXT: tbnz w9, #15, .LBB48_33 +; NO_SVE-NEXT: .LBB48_30: // %else44 +; NO_SVE-NEXT: tbz w9, #16, .LBB48_34 +; NO_SVE-NEXT: .LBB48_31: // %cond.load46 +; NO_SVE-NEXT: add x10, x0, #64 +; NO_SVE-NEXT: ld1 { v4.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #17, .LBB48_35 +; NO_SVE-NEXT: b .LBB48_36 +; NO_SVE-NEXT: .LBB48_32: // %cond.load40 +; NO_SVE-NEXT: add x10, x0, #56 +; NO_SVE-NEXT: ld1 { v3.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #15, .LBB48_30 +; NO_SVE-NEXT: .LBB48_33: // %cond.load43 +; NO_SVE-NEXT: add x10, x0, #60 +; NO_SVE-NEXT: ld1 { v3.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #16, .LBB48_31 +; NO_SVE-NEXT: .LBB48_34: +; NO_SVE-NEXT: // implicit-def: $q4 +; NO_SVE-NEXT: tbz w9, #17, .LBB48_36 +; NO_SVE-NEXT: .LBB48_35: // %cond.load49 +; NO_SVE-NEXT: add x10, x0, #68 +; NO_SVE-NEXT: ld1 { v4.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_36: // %else50 +; NO_SVE-NEXT: tbnz w9, #18, .LBB48_40 +; NO_SVE-NEXT: // %bb.37: // %else53 +; NO_SVE-NEXT: tbnz w9, #19, .LBB48_41 +; NO_SVE-NEXT: .LBB48_38: // %else56 +; NO_SVE-NEXT: tbz w9, #20, .LBB48_42 +; NO_SVE-NEXT: .LBB48_39: // %cond.load58 +; NO_SVE-NEXT: add x10, x0, #80 +; NO_SVE-NEXT: ld1 { v5.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #21, .LBB48_43 +; NO_SVE-NEXT: b .LBB48_44 +; NO_SVE-NEXT: .LBB48_40: // %cond.load52 +; NO_SVE-NEXT: add x10, x0, #72 +; NO_SVE-NEXT: ld1 { v4.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #19, .LBB48_38 +; NO_SVE-NEXT: .LBB48_41: // %cond.load55 +; NO_SVE-NEXT: add x10, x0, #76 +; NO_SVE-NEXT: ld1 { v4.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #20, .LBB48_39 +; NO_SVE-NEXT: .LBB48_42: +; NO_SVE-NEXT: // implicit-def: $q5 +; NO_SVE-NEXT: tbz w9, #21, .LBB48_44 +; NO_SVE-NEXT: .LBB48_43: // %cond.load61 +; NO_SVE-NEXT: add x10, x0, #84 +; NO_SVE-NEXT: ld1 { v5.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_44: // %else62 +; NO_SVE-NEXT: tbnz w9, #22, .LBB48_48 +; NO_SVE-NEXT: // %bb.45: // %else65 +; NO_SVE-NEXT: tbnz w9, #23, .LBB48_49 +; NO_SVE-NEXT: .LBB48_46: // %else68 +; NO_SVE-NEXT: tbz w9, #24, .LBB48_50 +; NO_SVE-NEXT: .LBB48_47: // %cond.load70 +; NO_SVE-NEXT: add x10, x0, #96 +; NO_SVE-NEXT: ld1 { v6.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #25, .LBB48_51 +; NO_SVE-NEXT: b .LBB48_52 +; NO_SVE-NEXT: .LBB48_48: // %cond.load64 +; NO_SVE-NEXT: add x10, x0, #88 +; NO_SVE-NEXT: ld1 { v5.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #23, .LBB48_46 +; NO_SVE-NEXT: .LBB48_49: // %cond.load67 +; NO_SVE-NEXT: add x10, x0, #92 +; NO_SVE-NEXT: ld1 { v5.s }[3], 
[x10] +; NO_SVE-NEXT: tbnz w9, #24, .LBB48_47 +; NO_SVE-NEXT: .LBB48_50: +; NO_SVE-NEXT: // implicit-def: $q6 +; NO_SVE-NEXT: tbz w9, #25, .LBB48_52 +; NO_SVE-NEXT: .LBB48_51: // %cond.load73 +; NO_SVE-NEXT: add x10, x0, #100 +; NO_SVE-NEXT: ld1 { v6.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_52: // %else74 +; NO_SVE-NEXT: tbnz w9, #26, .LBB48_56 +; NO_SVE-NEXT: // %bb.53: // %else77 +; NO_SVE-NEXT: tbnz w9, #27, .LBB48_57 +; NO_SVE-NEXT: .LBB48_54: // %else80 +; NO_SVE-NEXT: tbz w9, #28, .LBB48_58 +; NO_SVE-NEXT: .LBB48_55: // %cond.load82 +; NO_SVE-NEXT: add x10, x0, #112 +; NO_SVE-NEXT: ld1 { v7.s }[0], [x10] +; NO_SVE-NEXT: tbnz w9, #29, .LBB48_59 +; NO_SVE-NEXT: b .LBB48_60 +; NO_SVE-NEXT: .LBB48_56: // %cond.load76 +; NO_SVE-NEXT: add x10, x0, #104 +; NO_SVE-NEXT: ld1 { v6.s }[2], [x10] +; NO_SVE-NEXT: tbz w9, #27, .LBB48_54 +; NO_SVE-NEXT: .LBB48_57: // %cond.load79 +; NO_SVE-NEXT: add x10, x0, #108 +; NO_SVE-NEXT: ld1 { v6.s }[3], [x10] +; NO_SVE-NEXT: tbnz w9, #28, .LBB48_55 +; NO_SVE-NEXT: .LBB48_58: +; NO_SVE-NEXT: // implicit-def: $q7 +; NO_SVE-NEXT: tbz w9, #29, .LBB48_60 +; NO_SVE-NEXT: .LBB48_59: // %cond.load85 +; NO_SVE-NEXT: add x10, x0, #116 +; NO_SVE-NEXT: ld1 { v7.s }[1], [x10] +; NO_SVE-NEXT: .LBB48_60: // %else86 +; NO_SVE-NEXT: tbnz w9, #30, .LBB48_64 +; NO_SVE-NEXT: // %bb.61: // %else89 +; NO_SVE-NEXT: tbz w9, #31, .LBB48_63 +; NO_SVE-NEXT: .LBB48_62: // %cond.load91 +; NO_SVE-NEXT: add x9, x0, #124 +; NO_SVE-NEXT: ld1 { v7.s }[3], [x9] +; NO_SVE-NEXT: .LBB48_63: // %else92 +; NO_SVE-NEXT: ushll2 v16.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: stp q0, q16, [x8] +; NO_SVE-NEXT: ushll2 v16.2d, v1.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v1.2s, #0 +; NO_SVE-NEXT: ushll v1.2d, v2.2s, #0 +; NO_SVE-NEXT: stp q0, q16, [x8, #32] +; NO_SVE-NEXT: ushll2 v0.2d, v2.4s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #64] +; NO_SVE-NEXT: ushll2 v0.2d, v3.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v3.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #96] +; NO_SVE-NEXT: ushll2 v0.2d, v4.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v4.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #128] +; NO_SVE-NEXT: ushll2 v0.2d, v5.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v5.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #160] +; NO_SVE-NEXT: ushll2 v0.2d, v6.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v6.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #192] +; NO_SVE-NEXT: ushll2 v0.2d, v7.4s, #0 +; NO_SVE-NEXT: ushll v1.2d, v7.2s, #0 +; NO_SVE-NEXT: stp q1, q0, [x8, #224] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB48_64: // %cond.load88 +; NO_SVE-NEXT: add x10, x0, #120 +; NO_SVE-NEXT: ld1 { v7.s }[2], [x10] +; NO_SVE-NEXT: tbnz w9, #31, .LBB48_62 +; NO_SVE-NEXT: b .LBB48_63 +; ; VBITS_GE_2048-LABEL: masked_load_zext_v32i32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -839,13 +14023,102 @@ } define <8 x i64> @masked_load_sext_ugt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_sext_ugt_v8i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: cmtst v1.4s, v1.4s, v1.4s +; NO_SVE-NEXT: cmtst v0.4s, v0.4s, v0.4s +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, 
w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB49_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB49_3 +; NO_SVE-NEXT: b .LBB49_4 +; NO_SVE-NEXT: .LBB49_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB49_4 +; NO_SVE-NEXT: .LBB49_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB49_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB49_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB49_9 +; NO_SVE-NEXT: .LBB49_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB49_10 +; NO_SVE-NEXT: .LBB49_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB49_11 +; NO_SVE-NEXT: b .LBB49_12 +; NO_SVE-NEXT: .LBB49_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB49_6 +; NO_SVE-NEXT: .LBB49_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB49_7 +; NO_SVE-NEXT: .LBB49_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB49_12 +; NO_SVE-NEXT: .LBB49_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB49_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB49_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB49_15 +; NO_SVE-NEXT: .LBB49_14: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x8] +; NO_SVE-NEXT: .LBB49_15: // %else20 +; NO_SVE-NEXT: sshll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: sshll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: sshll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: sshll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB49_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB49_14 +; NO_SVE-NEXT: b .LBB49_15 +; ; VBITS_GE_512-LABEL: masked_load_sext_ugt_v8i32i64: -; VBITS_GE_512: // %bb.0 -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: ret %b = load <8 x i32>, <8 x i32>* %bp %mask = icmp ugt <8 x i32> %b, zeroinitializer %load = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %ap, i32 8, <8 x i1> %mask, <8 x i32> undef) @@ -854,13 +14127,102 @@ } define <8 x i64> @masked_load_zext_sgt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { +; NO_SVE-LABEL: masked_load_zext_sgt_v8i32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, 
[x1] +; NO_SVE-NEXT: cmgt v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmgt v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: umov w13, v0.b[5] +; NO_SVE-NEXT: umov w14, v0.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbz w9, #0, .LBB50_2 +; NO_SVE-NEXT: // %bb.1: // %cond.load +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: tbnz w8, #1, .LBB50_3 +; NO_SVE-NEXT: b .LBB50_4 +; NO_SVE-NEXT: .LBB50_2: +; NO_SVE-NEXT: // implicit-def: $q0 +; NO_SVE-NEXT: tbz w8, #1, .LBB50_4 +; NO_SVE-NEXT: .LBB50_3: // %cond.load1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: ld1 { v0.s }[1], [x9] +; NO_SVE-NEXT: .LBB50_4: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB50_8 +; NO_SVE-NEXT: // %bb.5: // %else5 +; NO_SVE-NEXT: tbnz w8, #3, .LBB50_9 +; NO_SVE-NEXT: .LBB50_6: // %else8 +; NO_SVE-NEXT: tbz w8, #4, .LBB50_10 +; NO_SVE-NEXT: .LBB50_7: // %cond.load10 +; NO_SVE-NEXT: add x9, x0, #16 +; NO_SVE-NEXT: ld1 { v2.s }[0], [x9] +; NO_SVE-NEXT: tbnz w8, #5, .LBB50_11 +; NO_SVE-NEXT: b .LBB50_12 +; NO_SVE-NEXT: .LBB50_8: // %cond.load4 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: ld1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB50_6 +; NO_SVE-NEXT: .LBB50_9: // %cond.load7 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: ld1 { v0.s }[3], [x9] +; NO_SVE-NEXT: tbnz w8, #4, .LBB50_7 +; NO_SVE-NEXT: .LBB50_10: +; NO_SVE-NEXT: // implicit-def: $q2 +; NO_SVE-NEXT: tbz w8, #5, .LBB50_12 +; NO_SVE-NEXT: .LBB50_11: // %cond.load13 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: ld1 { v2.s }[1], [x9] +; NO_SVE-NEXT: .LBB50_12: // %else14 +; NO_SVE-NEXT: tbnz w8, #6, .LBB50_16 +; NO_SVE-NEXT: // %bb.13: // %else17 +; NO_SVE-NEXT: tbz w8, #7, .LBB50_15 +; NO_SVE-NEXT: .LBB50_14: // %cond.load19 +; NO_SVE-NEXT: add x8, x0, #28 +; NO_SVE-NEXT: ld1 { v2.s }[3], [x8] +; NO_SVE-NEXT: .LBB50_15: // %else20 +; NO_SVE-NEXT: ushll2 v1.2d, v0.4s, #0 +; NO_SVE-NEXT: ushll v0.2d, v0.2s, #0 +; NO_SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; NO_SVE-NEXT: ushll v2.2d, v2.2s, #0 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB50_16: // %cond.load16 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: ld1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbnz w8, #7, .LBB50_14 +; NO_SVE-NEXT: b .LBB50_15 +; ; VBITS_GE_512-LABEL: masked_load_zext_sgt_v8i32i64: -; VBITS_GE_512: // %bb.0 -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpgt p1.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpgt p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: ret %b = load <8 x 
i32>, <8 x i32>* %bp %mask = icmp sgt <8 x i32> %b, zeroinitializer %load = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %ap, i32 8, <8 x i1> %mask, <8 x i32> undef) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -1,6 +1,7 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_384 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 @@ -24,6 +25,32 @@ ;; Masked Stores ;; define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 { +; NO_SVE-LABEL: masked_store_v2f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr s0, [x0] +; NO_SVE-NEXT: ldr s1, [x1] +; NO_SVE-NEXT: fcmeq v1.4h, v0.4h, v1.4h +; NO_SVE-NEXT: umov w8, v1.h[1] +; NO_SVE-NEXT: umov w9, v1.h[0] +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbnz w9, #0, .LBB0_3 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB0_4 +; NO_SVE-NEXT: .LBB0_2: // %else2 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB0_3: // %cond.store +; NO_SVE-NEXT: str h0, [x1] +; NO_SVE-NEXT: tbz w8, #1, .LBB0_2 +; NO_SVE-NEXT: .LBB0_4: // %cond.store1 +; NO_SVE-NEXT: add x8, x1, #2 +; NO_SVE-NEXT: st1 { v0.h }[1], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_store_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s1, [x0] @@ -55,6 +82,32 @@ define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_store_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr d0, [x0] +; NO_SVE-NEXT: ldr d1, [x1] +; NO_SVE-NEXT: fcmeq v1.2s, v0.2s, v1.2s +; NO_SVE-NEXT: mov w8, v1.s[1] +; NO_SVE-NEXT: fmov w9, s1 +; NO_SVE-NEXT: bfi w9, w8, #1, #31 +; NO_SVE-NEXT: and w8, w9, #0x3 +; NO_SVE-NEXT: tbnz w9, #0, .LBB1_3 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB1_4 +; NO_SVE-NEXT: .LBB1_2: // %else2 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB1_3: // %cond.store +; NO_SVE-NEXT: str s0, [x1] +; NO_SVE-NEXT: tbz w8, #1, .LBB1_2 +; NO_SVE-NEXT: .LBB1_4: // %cond.store1 +; NO_SVE-NEXT: add x8, x1, #4 +; NO_SVE-NEXT: st1 { v0.s }[1], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_store_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] @@ -72,6 +125,51 @@ } define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_store_v4f32: +; NO_SVE: 
// %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: fcmeq v1.4s, v0.4s, v1.4s +; NO_SVE-NEXT: xtn v1.4h, v1.4s +; NO_SVE-NEXT: umov w8, v1.h[1] +; NO_SVE-NEXT: umov w9, v1.h[2] +; NO_SVE-NEXT: umov w10, v1.h[0] +; NO_SVE-NEXT: umov w11, v1.h[3] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: bfi w10, w11, #3, #29 +; NO_SVE-NEXT: and w8, w10, #0xf +; NO_SVE-NEXT: tbnz w10, #0, .LBB2_5 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB2_6 +; NO_SVE-NEXT: .LBB2_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB2_7 +; NO_SVE-NEXT: .LBB2_3: // %else4 +; NO_SVE-NEXT: tbnz w8, #3, .LBB2_8 +; NO_SVE-NEXT: .LBB2_4: // %else6 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB2_5: // %cond.store +; NO_SVE-NEXT: str s0, [x1] +; NO_SVE-NEXT: tbz w8, #1, .LBB2_2 +; NO_SVE-NEXT: .LBB2_6: // %cond.store1 +; NO_SVE-NEXT: add x9, x1, #4 +; NO_SVE-NEXT: st1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB2_3 +; NO_SVE-NEXT: .LBB2_7: // %cond.store3 +; NO_SVE-NEXT: add x9, x1, #8 +; NO_SVE-NEXT: st1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB2_4 +; NO_SVE-NEXT: .LBB2_8: // %cond.store5 +; NO_SVE-NEXT: add x8, x1, #12 +; NO_SVE-NEXT: st1 { v0.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_store_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] @@ -89,6 +187,89 @@ } define void @masked_store_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_store_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: ldp q3, q2, [x1] +; NO_SVE-NEXT: fcmeq v3.4s, v1.4s, v3.4s +; NO_SVE-NEXT: fcmeq v2.4s, v0.4s, v2.4s +; NO_SVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v2.b[7] +; NO_SVE-NEXT: bfi w9, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB3_9 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB3_10 +; NO_SVE-NEXT: .LBB3_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB3_11 +; NO_SVE-NEXT: .LBB3_3: // %else4 +; NO_SVE-NEXT: tbnz w8, #3, .LBB3_12 +; NO_SVE-NEXT: .LBB3_4: // %else6 +; NO_SVE-NEXT: tbnz w8, #4, .LBB3_13 +; NO_SVE-NEXT: .LBB3_5: // %else8 +; NO_SVE-NEXT: tbnz w8, #5, .LBB3_14 +; NO_SVE-NEXT: .LBB3_6: // %else10 +; NO_SVE-NEXT: tbnz w8, #6, .LBB3_15 +; NO_SVE-NEXT: .LBB3_7: // %else12 +; NO_SVE-NEXT: tbnz w8, #7, .LBB3_16 +; NO_SVE-NEXT: .LBB3_8: // %else14 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB3_9: // %cond.store +; NO_SVE-NEXT: str s1, [x1] +; NO_SVE-NEXT: tbz w8, #1, .LBB3_2 +; NO_SVE-NEXT: 
.LBB3_10: // %cond.store1 +; NO_SVE-NEXT: add x9, x1, #4 +; NO_SVE-NEXT: st1 { v1.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB3_3 +; NO_SVE-NEXT: .LBB3_11: // %cond.store3 +; NO_SVE-NEXT: add x9, x1, #8 +; NO_SVE-NEXT: st1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB3_4 +; NO_SVE-NEXT: .LBB3_12: // %cond.store5 +; NO_SVE-NEXT: add x9, x1, #12 +; NO_SVE-NEXT: st1 { v1.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB3_5 +; NO_SVE-NEXT: .LBB3_13: // %cond.store7 +; NO_SVE-NEXT: str s0, [x1, #16] +; NO_SVE-NEXT: tbz w8, #5, .LBB3_6 +; NO_SVE-NEXT: .LBB3_14: // %cond.store9 +; NO_SVE-NEXT: add x9, x1, #20 +; NO_SVE-NEXT: st1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB3_7 +; NO_SVE-NEXT: .LBB3_15: // %cond.store11 +; NO_SVE-NEXT: add x9, x1, #24 +; NO_SVE-NEXT: st1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB3_8 +; NO_SVE-NEXT: .LBB3_16: // %cond.store13 +; NO_SVE-NEXT: add x8, x1, #28 +; NO_SVE-NEXT: st1 { v0.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: masked_store_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -105,6 +286,193 @@ } define void @masked_store_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_store_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q2, [x0] +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: fcmeq v1.4s, v3.4s, v1.4s +; NO_SVE-NEXT: fcmeq v4.4s, v2.4s, v0.4s +; NO_SVE-NEXT: ldp q6, q5, [x1, #32] +; NO_SVE-NEXT: uzp1 v4.8h, v1.8h, v4.8h +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: umov w8, v4.b[1] +; NO_SVE-NEXT: umov w10, v4.b[2] +; NO_SVE-NEXT: fcmeq v6.4s, v1.4s, v6.4s +; NO_SVE-NEXT: umov w9, v4.b[0] +; NO_SVE-NEXT: umov w11, v4.b[3] +; NO_SVE-NEXT: fcmeq v5.4s, v0.4s, v5.4s +; NO_SVE-NEXT: umov w12, v4.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v4.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: uzp1 v5.8h, v6.8h, v5.8h +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v4.b[6] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v4.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w11, v5.b[0] +; NO_SVE-NEXT: umov w12, v5.b[1] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w13, v5.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v5.b[3] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v5.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v5.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v5.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: and w9, w11, #0x1 +; NO_SVE-NEXT: umov w11, v5.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB4_17 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, 
.LBB4_18 +; NO_SVE-NEXT: .LBB4_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB4_19 +; NO_SVE-NEXT: .LBB4_3: // %else4 +; NO_SVE-NEXT: tbnz w8, #3, .LBB4_20 +; NO_SVE-NEXT: .LBB4_4: // %else6 +; NO_SVE-NEXT: tbnz w8, #4, .LBB4_21 +; NO_SVE-NEXT: .LBB4_5: // %else8 +; NO_SVE-NEXT: tbnz w8, #5, .LBB4_22 +; NO_SVE-NEXT: .LBB4_6: // %else10 +; NO_SVE-NEXT: tbnz w8, #6, .LBB4_23 +; NO_SVE-NEXT: .LBB4_7: // %else12 +; NO_SVE-NEXT: tbnz w8, #7, .LBB4_24 +; NO_SVE-NEXT: .LBB4_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #8, .LBB4_25 +; NO_SVE-NEXT: .LBB4_9: // %else16 +; NO_SVE-NEXT: tbnz w8, #9, .LBB4_26 +; NO_SVE-NEXT: .LBB4_10: // %else18 +; NO_SVE-NEXT: tbnz w8, #10, .LBB4_27 +; NO_SVE-NEXT: .LBB4_11: // %else20 +; NO_SVE-NEXT: tbnz w8, #11, .LBB4_28 +; NO_SVE-NEXT: .LBB4_12: // %else22 +; NO_SVE-NEXT: tbnz w8, #12, .LBB4_29 +; NO_SVE-NEXT: .LBB4_13: // %else24 +; NO_SVE-NEXT: tbnz w8, #13, .LBB4_30 +; NO_SVE-NEXT: .LBB4_14: // %else26 +; NO_SVE-NEXT: tbnz w8, #14, .LBB4_31 +; NO_SVE-NEXT: .LBB4_15: // %else28 +; NO_SVE-NEXT: tbnz w8, #15, .LBB4_32 +; NO_SVE-NEXT: .LBB4_16: // %else30 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB4_17: // %cond.store +; NO_SVE-NEXT: str s3, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB4_2 +; NO_SVE-NEXT: .LBB4_18: // %cond.store1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: st1 { v3.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB4_3 +; NO_SVE-NEXT: .LBB4_19: // %cond.store3 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: st1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB4_4 +; NO_SVE-NEXT: .LBB4_20: // %cond.store5 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: st1 { v3.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB4_5 +; NO_SVE-NEXT: .LBB4_21: // %cond.store7 +; NO_SVE-NEXT: str s2, [x0, #16] +; NO_SVE-NEXT: tbz w8, #5, .LBB4_6 +; NO_SVE-NEXT: .LBB4_22: // %cond.store9 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: st1 { v2.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB4_7 +; NO_SVE-NEXT: .LBB4_23: // %cond.store11 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: st1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB4_8 +; NO_SVE-NEXT: .LBB4_24: // %cond.store13 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: st1 { v2.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB4_9 +; NO_SVE-NEXT: .LBB4_25: // %cond.store15 +; NO_SVE-NEXT: str s1, [x0, #32] +; NO_SVE-NEXT: tbz w8, #9, .LBB4_10 +; NO_SVE-NEXT: .LBB4_26: // %cond.store17 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: st1 { v1.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB4_11 +; NO_SVE-NEXT: .LBB4_27: // %cond.store19 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: st1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB4_12 +; NO_SVE-NEXT: .LBB4_28: // %cond.store21 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: st1 { v1.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB4_13 +; NO_SVE-NEXT: .LBB4_29: // %cond.store23 +; NO_SVE-NEXT: str s0, [x0, #48] +; NO_SVE-NEXT: tbz w8, #13, .LBB4_14 +; NO_SVE-NEXT: .LBB4_30: // %cond.store25 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: st1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB4_15 +; NO_SVE-NEXT: .LBB4_31: // %cond.store27 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: st1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB4_16 +; NO_SVE-NEXT: .LBB4_32: // %cond.store29 +; NO_SVE-NEXT: add x8, x0, #60 +; NO_SVE-NEXT: st1 { v0.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: masked_store_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; 
VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z1.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: masked_store_v16f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_384-NEXT: fcmeq p0.s, p0/z, z1.s, z3.s +; VBITS_GE_384-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_store_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -121,6 +489,363 @@ } define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_store_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: ldp q0, q1, [x1, #64] +; NO_SVE-NEXT: fcmeq v7.4s, v3.4s, v0.4s +; NO_SVE-NEXT: fcmeq v5.4s, v2.4s, v1.4s +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: uzp1 v5.8h, v7.8h, v5.8h +; NO_SVE-NEXT: ldp q16, q17, [x1, #96] +; NO_SVE-NEXT: xtn v20.8b, v5.8h +; NO_SVE-NEXT: umov w8, v20.b[1] +; NO_SVE-NEXT: umov w10, v20.b[2] +; NO_SVE-NEXT: fcmeq v16.4s, v1.4s, v16.4s +; NO_SVE-NEXT: umov w9, v20.b[0] +; NO_SVE-NEXT: umov w11, v20.b[3] +; NO_SVE-NEXT: fcmeq v17.4s, v0.4s, v17.4s +; NO_SVE-NEXT: umov w12, v20.b[4] +; NO_SVE-NEXT: ldp q6, q4, [x0] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v20.b[5] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v20.b[6] +; NO_SVE-NEXT: uzp1 v16.8h, v16.8h, v17.8h +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v20.b[7] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: ldp q18, q7, [x1] +; NO_SVE-NEXT: xtn v16.8b, v16.8h +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w10, v16.b[0] +; NO_SVE-NEXT: umov w11, v16.b[1] +; NO_SVE-NEXT: fcmeq v18.4s, v6.4s, v18.4s +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v16.b[2] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: fcmeq v19.4s, v4.4s, v7.4s +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: ldp q7, q5, [x0, #32] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w13, v16.b[3] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #6 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: uzp1 v17.8h, v18.8h, v19.8h +; NO_SVE-NEXT: umov w14, v16.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #7 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #8 +; NO_SVE-NEXT: ldp q18, q19, [x1, #32] +; NO_SVE-NEXT: xtn v17.8b, v17.8h +; NO_SVE-NEXT: orr w8, w8, w11, lsl #9 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #10 +; NO_SVE-NEXT: umov w10, v17.b[1] +; NO_SVE-NEXT: umov w12, v17.b[2] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w11, v17.b[0] 
+; NO_SVE-NEXT: fcmeq v18.4s, v7.4s, v18.4s +; NO_SVE-NEXT: orr w8, w8, w13, lsl #11 +; NO_SVE-NEXT: fcmeq v19.4s, v5.4s, v19.4s +; NO_SVE-NEXT: orr w8, w8, w14, lsl #12 +; NO_SVE-NEXT: umov w13, v17.b[3] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v17.b[4] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w15, v17.b[5] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: uzp1 v18.8h, v18.8h, v19.8h +; NO_SVE-NEXT: bfi w11, w10, #1, #1 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: bfi w11, w12, #2, #1 +; NO_SVE-NEXT: and w12, w14, #0x1 +; NO_SVE-NEXT: umov w14, v17.b[6] +; NO_SVE-NEXT: umov w9, v16.b[5] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: bfi w11, w10, #3, #1 +; NO_SVE-NEXT: umov w10, v17.b[7] +; NO_SVE-NEXT: xtn v17.8b, v18.8h +; NO_SVE-NEXT: bfi w11, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v16.b[6] +; NO_SVE-NEXT: bfi w11, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v17.b[0] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w15, v17.b[1] +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: orr w9, w11, w13, lsl #6 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v17.b[2] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: umov w14, v17.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #8 +; NO_SVE-NEXT: umov w10, v17.b[4] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v17.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v17.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v16.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v17.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbnz w8, #0, .LBB5_33 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB5_34 +; NO_SVE-NEXT: .LBB5_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB5_35 +; NO_SVE-NEXT: .LBB5_3: // %else4 +; NO_SVE-NEXT: tbnz w8, #3, .LBB5_36 +; NO_SVE-NEXT: .LBB5_4: // %else6 +; NO_SVE-NEXT: tbnz w8, #4, .LBB5_37 +; NO_SVE-NEXT: .LBB5_5: // %else8 +; NO_SVE-NEXT: tbnz w8, #5, .LBB5_38 +; NO_SVE-NEXT: .LBB5_6: // %else10 +; NO_SVE-NEXT: tbnz w8, #6, .LBB5_39 +; NO_SVE-NEXT: .LBB5_7: // %else12 +; NO_SVE-NEXT: tbnz w8, #7, .LBB5_40 +; NO_SVE-NEXT: .LBB5_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #8, .LBB5_41 +; NO_SVE-NEXT: .LBB5_9: // %else16 +; NO_SVE-NEXT: tbnz w8, #9, .LBB5_42 +; NO_SVE-NEXT: .LBB5_10: // %else18 +; NO_SVE-NEXT: tbnz w8, #10, .LBB5_43 +; NO_SVE-NEXT: .LBB5_11: // %else20 +; NO_SVE-NEXT: tbnz w8, #11, .LBB5_44 +; NO_SVE-NEXT: .LBB5_12: // %else22 +; NO_SVE-NEXT: tbnz w8, #12, .LBB5_45 +; NO_SVE-NEXT: .LBB5_13: // %else24 +; NO_SVE-NEXT: tbnz w8, #13, .LBB5_46 +; NO_SVE-NEXT: .LBB5_14: // %else26 +; NO_SVE-NEXT: tbnz w8, #14, .LBB5_47 +; NO_SVE-NEXT: .LBB5_15: // %else28 +; NO_SVE-NEXT: tbnz w8, #15, .LBB5_48 +; NO_SVE-NEXT: .LBB5_16: // %else30 +; NO_SVE-NEXT: tbnz w8, #16, .LBB5_49 +; NO_SVE-NEXT: .LBB5_17: // %else32 +; NO_SVE-NEXT: tbnz w8, #17, .LBB5_50 +; NO_SVE-NEXT: .LBB5_18: // %else34 +; NO_SVE-NEXT: tbnz w8, #18, .LBB5_51 +; NO_SVE-NEXT: 
.LBB5_19: // %else36 +; NO_SVE-NEXT: tbnz w8, #19, .LBB5_52 +; NO_SVE-NEXT: .LBB5_20: // %else38 +; NO_SVE-NEXT: tbnz w8, #20, .LBB5_53 +; NO_SVE-NEXT: .LBB5_21: // %else40 +; NO_SVE-NEXT: tbnz w8, #21, .LBB5_54 +; NO_SVE-NEXT: .LBB5_22: // %else42 +; NO_SVE-NEXT: tbnz w8, #22, .LBB5_55 +; NO_SVE-NEXT: .LBB5_23: // %else44 +; NO_SVE-NEXT: tbnz w8, #23, .LBB5_56 +; NO_SVE-NEXT: .LBB5_24: // %else46 +; NO_SVE-NEXT: tbnz w8, #24, .LBB5_57 +; NO_SVE-NEXT: .LBB5_25: // %else48 +; NO_SVE-NEXT: tbnz w8, #25, .LBB5_58 +; NO_SVE-NEXT: .LBB5_26: // %else50 +; NO_SVE-NEXT: tbnz w8, #26, .LBB5_59 +; NO_SVE-NEXT: .LBB5_27: // %else52 +; NO_SVE-NEXT: tbnz w8, #27, .LBB5_60 +; NO_SVE-NEXT: .LBB5_28: // %else54 +; NO_SVE-NEXT: tbnz w8, #28, .LBB5_61 +; NO_SVE-NEXT: .LBB5_29: // %else56 +; NO_SVE-NEXT: tbnz w8, #29, .LBB5_62 +; NO_SVE-NEXT: .LBB5_30: // %else58 +; NO_SVE-NEXT: tbnz w8, #30, .LBB5_63 +; NO_SVE-NEXT: .LBB5_31: // %else60 +; NO_SVE-NEXT: tbnz w8, #31, .LBB5_64 +; NO_SVE-NEXT: .LBB5_32: // %else62 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB5_33: // %cond.store +; NO_SVE-NEXT: str s6, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB5_2 +; NO_SVE-NEXT: .LBB5_34: // %cond.store1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: st1 { v6.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB5_3 +; NO_SVE-NEXT: .LBB5_35: // %cond.store3 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: st1 { v6.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB5_4 +; NO_SVE-NEXT: .LBB5_36: // %cond.store5 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: st1 { v6.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB5_5 +; NO_SVE-NEXT: .LBB5_37: // %cond.store7 +; NO_SVE-NEXT: str s4, [x0, #16] +; NO_SVE-NEXT: tbz w8, #5, .LBB5_6 +; NO_SVE-NEXT: .LBB5_38: // %cond.store9 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: st1 { v4.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB5_7 +; NO_SVE-NEXT: .LBB5_39: // %cond.store11 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: st1 { v4.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB5_8 +; NO_SVE-NEXT: .LBB5_40: // %cond.store13 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: st1 { v4.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB5_9 +; NO_SVE-NEXT: .LBB5_41: // %cond.store15 +; NO_SVE-NEXT: str s7, [x0, #32] +; NO_SVE-NEXT: tbz w8, #9, .LBB5_10 +; NO_SVE-NEXT: .LBB5_42: // %cond.store17 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: st1 { v7.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB5_11 +; NO_SVE-NEXT: .LBB5_43: // %cond.store19 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: st1 { v7.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB5_12 +; NO_SVE-NEXT: .LBB5_44: // %cond.store21 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: st1 { v7.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB5_13 +; NO_SVE-NEXT: .LBB5_45: // %cond.store23 +; NO_SVE-NEXT: str s5, [x0, #48] +; NO_SVE-NEXT: tbz w8, #13, .LBB5_14 +; NO_SVE-NEXT: .LBB5_46: // %cond.store25 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: st1 { v5.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB5_15 +; NO_SVE-NEXT: .LBB5_47: // %cond.store27 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: st1 { v5.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB5_16 +; NO_SVE-NEXT: .LBB5_48: // %cond.store29 +; NO_SVE-NEXT: add x9, x0, #60 +; NO_SVE-NEXT: st1 { v5.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #16, .LBB5_17 +; NO_SVE-NEXT: .LBB5_49: // %cond.store31 +; NO_SVE-NEXT: str s3, [x0, #64] +; NO_SVE-NEXT: tbz w8, #17, .LBB5_18 +; NO_SVE-NEXT: .LBB5_50: // %cond.store33 +; NO_SVE-NEXT: add x9, x0, #68 +; NO_SVE-NEXT: st1 { v3.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, 
#18, .LBB5_19 +; NO_SVE-NEXT: .LBB5_51: // %cond.store35 +; NO_SVE-NEXT: add x9, x0, #72 +; NO_SVE-NEXT: st1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB5_20 +; NO_SVE-NEXT: .LBB5_52: // %cond.store37 +; NO_SVE-NEXT: add x9, x0, #76 +; NO_SVE-NEXT: st1 { v3.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB5_21 +; NO_SVE-NEXT: .LBB5_53: // %cond.store39 +; NO_SVE-NEXT: str s2, [x0, #80] +; NO_SVE-NEXT: tbz w8, #21, .LBB5_22 +; NO_SVE-NEXT: .LBB5_54: // %cond.store41 +; NO_SVE-NEXT: add x9, x0, #84 +; NO_SVE-NEXT: st1 { v2.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB5_23 +; NO_SVE-NEXT: .LBB5_55: // %cond.store43 +; NO_SVE-NEXT: add x9, x0, #88 +; NO_SVE-NEXT: st1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB5_24 +; NO_SVE-NEXT: .LBB5_56: // %cond.store45 +; NO_SVE-NEXT: add x9, x0, #92 +; NO_SVE-NEXT: st1 { v2.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB5_25 +; NO_SVE-NEXT: .LBB5_57: // %cond.store47 +; NO_SVE-NEXT: str s1, [x0, #96] +; NO_SVE-NEXT: tbz w8, #25, .LBB5_26 +; NO_SVE-NEXT: .LBB5_58: // %cond.store49 +; NO_SVE-NEXT: add x9, x0, #100 +; NO_SVE-NEXT: st1 { v1.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB5_27 +; NO_SVE-NEXT: .LBB5_59: // %cond.store51 +; NO_SVE-NEXT: add x9, x0, #104 +; NO_SVE-NEXT: st1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB5_28 +; NO_SVE-NEXT: .LBB5_60: // %cond.store53 +; NO_SVE-NEXT: add x9, x0, #108 +; NO_SVE-NEXT: st1 { v1.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB5_29 +; NO_SVE-NEXT: .LBB5_61: // %cond.store55 +; NO_SVE-NEXT: str s0, [x0, #112] +; NO_SVE-NEXT: tbz w8, #29, .LBB5_30 +; NO_SVE-NEXT: .LBB5_62: // %cond.store57 +; NO_SVE-NEXT: add x9, x0, #116 +; NO_SVE-NEXT: st1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB5_31 +; NO_SVE-NEXT: .LBB5_63: // %cond.store59 +; NO_SVE-NEXT: add x9, x0, #120 +; NO_SVE-NEXT: st1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #31, .LBB5_32 +; NO_SVE-NEXT: .LBB5_64: // %cond.store61 +; NO_SVE-NEXT: add x8, x0, #124 +; NO_SVE-NEXT: st1 { v0.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: masked_store_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z0.s, z6.s +; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z3.s, z7.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p3, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p2, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p1, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: masked_store_v32f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #24 +; VBITS_GE_384-NEXT: mov x9, #16 +; VBITS_GE_384-NEXT: mov x10, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl 
#2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s +; VBITS_GE_384-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s +; VBITS_GE_384-NEXT: fcmeq p3.s, p0/z, z0.s, z6.s +; VBITS_GE_384-NEXT: fcmeq p0.s, p0/z, z3.s, z7.s +; VBITS_GE_384-NEXT: st1w { z0.s }, p3, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p2, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z2.s }, p1, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_store_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -137,6 +862,712 @@ } define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 { +; NO_SVE-LABEL: masked_store_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #48 +; NO_SVE-NEXT: .cfi_def_cfa_offset 48 +; NO_SVE-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill +; NO_SVE-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; NO_SVE-NEXT: .cfi_offset b8, -8 +; NO_SVE-NEXT: .cfi_offset b9, -16 +; NO_SVE-NEXT: .cfi_offset b10, -24 +; NO_SVE-NEXT: .cfi_offset b11, -32 +; NO_SVE-NEXT: ldp q3, q2, [x0, #192] +; NO_SVE-NEXT: ldp q4, q5, [x1, #192] +; NO_SVE-NEXT: fcmeq v4.4s, v3.4s, v4.4s +; NO_SVE-NEXT: fcmeq v5.4s, v2.4s, v5.4s +; NO_SVE-NEXT: ldp q1, q0, [x0, #224] +; NO_SVE-NEXT: uzp1 v4.8h, v4.8h, v5.8h +; NO_SVE-NEXT: ldp q6, q7, [x1, #224] +; NO_SVE-NEXT: xtn v24.8b, v4.8h +; NO_SVE-NEXT: umov w8, v24.b[1] +; NO_SVE-NEXT: umov w10, v24.b[2] +; NO_SVE-NEXT: fcmeq v6.4s, v1.4s, v6.4s +; NO_SVE-NEXT: umov w9, v24.b[0] +; NO_SVE-NEXT: umov w11, v24.b[3] +; NO_SVE-NEXT: fcmeq v7.4s, v0.4s, v7.4s +; NO_SVE-NEXT: umov w12, v24.b[4] +; NO_SVE-NEXT: umov w13, v24.b[5] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: ldp q26, q27, [x1, #128] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w14, v24.b[6] +; NO_SVE-NEXT: uzp1 v5.8h, v6.8h, v7.8h +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: and w8, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w15, v24.b[7] +; NO_SVE-NEXT: ldp q7, q6, [x0, #128] +; NO_SVE-NEXT: xtn v28.8b, v5.8h +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w10, v28.b[0] +; NO_SVE-NEXT: bfi w9, w8, #5, #1 +; NO_SVE-NEXT: fcmeq v30.4s, v7.4s, v26.4s +; NO_SVE-NEXT: orr w8, w9, w11, lsl #6 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: umov w11, v28.b[2] +; NO_SVE-NEXT: fcmeq v29.4s, v6.4s, v27.4s +; NO_SVE-NEXT: umov w14, v28.b[6] +; NO_SVE-NEXT: and w9, w10, #0x1 +; NO_SVE-NEXT: umov w10, v28.b[1] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #7 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: uzp1 v31.8h, v30.8h, v29.8h +; NO_SVE-NEXT: and w9, w10, #0x1 +; NO_SVE-NEXT: umov w10, v28.b[3] +; NO_SVE-NEXT: ldp q5, q4, [x0, #160] +; NO_SVE-NEXT: orr w8, w8, w9, lsl #9 +; NO_SVE-NEXT: xtn v31.8b, v31.8h +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: and w9, w10, #0x1 +; NO_SVE-NEXT: umov w10, v28.b[4] +; NO_SVE-NEXT: umov w11, v31.b[1] +; NO_SVE-NEXT: umov w12, v31.b[0] +; NO_SVE-NEXT: umov w13, v31.b[2] +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: ldp q10, 
q11, [x1, #160] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: umov w9, v28.b[5] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v31.b[6] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v31.b[3] +; NO_SVE-NEXT: bfi w10, w11, #1, #1 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v31.b[4] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: fcmeq v11.4s, v4.4s, v11.4s +; NO_SVE-NEXT: fcmeq v10.4s, v5.4s, v10.4s +; NO_SVE-NEXT: bfi w10, w11, #2, #1 +; NO_SVE-NEXT: ldp q19, q18, [x0, #64] +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v31.b[5] +; NO_SVE-NEXT: orr w8, w8, w9, lsl #13 +; NO_SVE-NEXT: uzp1 v10.8h, v10.8h, v11.8h +; NO_SVE-NEXT: and w9, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w11, #3, #1 +; NO_SVE-NEXT: umov w14, v28.b[7] +; NO_SVE-NEXT: bfi w10, w12, #4, #1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: ldp q8, q9, [x1, #64] +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v31.b[7] +; NO_SVE-NEXT: xtn v31.8b, v10.8h +; NO_SVE-NEXT: orr w8, w8, w9, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w14, lsl #15 +; NO_SVE-NEXT: fcmeq v8.4s, v19.4s, v8.4s +; NO_SVE-NEXT: bfi w10, w11, #5, #1 +; NO_SVE-NEXT: umov w11, v31.b[0] +; NO_SVE-NEXT: orr w9, w10, w12, lsl #6 +; NO_SVE-NEXT: fcmeq v9.4s, v18.4s, v9.4s +; NO_SVE-NEXT: umov w12, v31.b[1] +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: umov w13, v31.b[2] +; NO_SVE-NEXT: ldp q17, q16, [x0, #96] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #7 +; NO_SVE-NEXT: uzp1 v8.8h, v8.8h, v9.8h +; NO_SVE-NEXT: umov w10, v31.b[3] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v31.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #9 +; NO_SVE-NEXT: umov w14, v31.b[6] +; NO_SVE-NEXT: xtn v28.8b, v8.8h +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: ldp q29, q30, [x1, #96] +; NO_SVE-NEXT: umov w11, v28.b[1] +; NO_SVE-NEXT: orr w9, w9, w12, lsl #10 +; NO_SVE-NEXT: umov w12, v31.b[5] +; NO_SVE-NEXT: umov w15, v28.b[0] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #11 +; NO_SVE-NEXT: and w10, w13, #0x1 +; NO_SVE-NEXT: umov w13, v28.b[2] +; NO_SVE-NEXT: umov w16, v28.b[5] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: fcmeq v30.4s, v16.4s, v30.4s +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: fcmeq v29.4s, v17.4s, v29.4s +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w12, w11, #1, #1 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v28.b[3] +; NO_SVE-NEXT: umov w15, v28.b[4] +; NO_SVE-NEXT: ldp q23, q22, [x0] +; NO_SVE-NEXT: uzp1 v29.8h, v29.8h, v30.8h +; NO_SVE-NEXT: bfi w12, w11, #2, #1 +; NO_SVE-NEXT: orr w9, w9, w10, lsl #13 +; NO_SVE-NEXT: umov w10, v28.b[7] +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v28.b[6] +; NO_SVE-NEXT: xtn v28.8b, v29.8h +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: ldp q26, q27, [x1] +; NO_SVE-NEXT: bfi w12, w11, #3, #1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: bfi w12, w13, #4, #1 +; NO_SVE-NEXT: and w11, w16, #0x1 +; NO_SVE-NEXT: umov w13, v28.b[0] +; NO_SVE-NEXT: bfi w12, w15, #5, #1 +; NO_SVE-NEXT: fcmeq v26.4s, v23.4s, v26.4s +; NO_SVE-NEXT: orr w11, w12, w11, lsl #6 +; NO_SVE-NEXT: umov w12, v28.b[1] +; NO_SVE-NEXT: orr w10, w11, w10, lsl #7 +; NO_SVE-NEXT: fcmeq v27.4s, v22.4s, v27.4s +; NO_SVE-NEXT: umov w15, v28.b[3] 
+; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v28.b[2] +; NO_SVE-NEXT: ldp q21, q20, [x0, #32] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w10, w10, w11, lsl #8 +; NO_SVE-NEXT: uzp1 v26.8h, v26.8h, v27.8h +; NO_SVE-NEXT: umov w11, v28.b[4] +; NO_SVE-NEXT: orr w9, w9, w14, lsl #14 +; NO_SVE-NEXT: umov w14, v31.b[7] +; NO_SVE-NEXT: orr w10, w10, w12, lsl #9 +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: umov w13, v28.b[5] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: ldp q24, q25, [x1, #32] +; NO_SVE-NEXT: xtn v26.8b, v26.8h +; NO_SVE-NEXT: orr w10, w10, w12, lsl #10 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w9, w9, w14, lsl #15 +; NO_SVE-NEXT: umov w16, v26.b[1] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #11 +; NO_SVE-NEXT: umov w14, v26.b[2] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #12 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: umov w13, v26.b[0] +; NO_SVE-NEXT: fcmeq v25.4s, v20.4s, v25.4s +; NO_SVE-NEXT: umov w15, v26.b[3] +; NO_SVE-NEXT: fcmeq v24.4s, v21.4s, v24.4s +; NO_SVE-NEXT: and w12, w16, #0x1 +; NO_SVE-NEXT: umov w16, v26.b[4] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w17, v26.b[5] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w13, w12, #1, #1 +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: uzp1 v24.8h, v24.8h, v25.8h +; NO_SVE-NEXT: bfi w13, w14, #2, #1 +; NO_SVE-NEXT: and w14, w16, #0x1 +; NO_SVE-NEXT: umov w16, v26.b[6] +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: bfi w13, w12, #3, #1 +; NO_SVE-NEXT: umov w12, v26.b[7] +; NO_SVE-NEXT: orr w10, w10, w11, lsl #13 +; NO_SVE-NEXT: xtn v24.8b, v24.8h +; NO_SVE-NEXT: bfi w13, w14, #4, #1 +; NO_SVE-NEXT: umov w14, v28.b[6] +; NO_SVE-NEXT: bfi w13, w15, #5, #1 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v24.b[0] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w17, v24.b[1] +; NO_SVE-NEXT: orr w11, w13, w15, lsl #6 +; NO_SVE-NEXT: bfi w9, w8, #16, #16 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v24.b[2] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #7 +; NO_SVE-NEXT: and w12, w16, #0x1 +; NO_SVE-NEXT: umov w16, v24.b[3] +; NO_SVE-NEXT: and w15, w17, #0x1 +; NO_SVE-NEXT: orr w11, w11, w12, lsl #8 +; NO_SVE-NEXT: umov w12, v24.b[4] +; NO_SVE-NEXT: orr w10, w10, w13, lsl #14 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v24.b[5] +; NO_SVE-NEXT: orr w11, w11, w15, lsl #9 +; NO_SVE-NEXT: and w15, w16, #0x1 +; NO_SVE-NEXT: umov w16, v24.b[6] +; NO_SVE-NEXT: orr w11, w11, w13, lsl #10 +; NO_SVE-NEXT: umov w13, v28.b[7] +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: orr w11, w11, w15, lsl #11 +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: umov w15, v24.b[7] +; NO_SVE-NEXT: orr w11, w11, w12, lsl #12 +; NO_SVE-NEXT: and w12, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w13, lsl #15 +; NO_SVE-NEXT: orr w11, w11, w14, lsl #13 +; NO_SVE-NEXT: orr w8, w11, w12, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w15, lsl #15 +; NO_SVE-NEXT: bfi w8, w10, #16, #16 +; NO_SVE-NEXT: bfi x8, x9, #32, #32 +; NO_SVE-NEXT: tbnz w8, #0, .LBB6_66 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB6_67 +; NO_SVE-NEXT: .LBB6_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB6_68 +; NO_SVE-NEXT: .LBB6_3: // %else4 +; NO_SVE-NEXT: tbnz w8, #3, .LBB6_69 +; NO_SVE-NEXT: .LBB6_4: // %else6 +; NO_SVE-NEXT: tbnz w8, #4, .LBB6_70 +; NO_SVE-NEXT: .LBB6_5: // %else8 +; NO_SVE-NEXT: tbnz w8, #5, .LBB6_71 +; NO_SVE-NEXT: .LBB6_6: // %else10 +; NO_SVE-NEXT: tbnz w8, #6, .LBB6_72 +; NO_SVE-NEXT: .LBB6_7: // %else12 +; NO_SVE-NEXT: tbnz w8, #7, 
.LBB6_73 +; NO_SVE-NEXT: .LBB6_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #8, .LBB6_74 +; NO_SVE-NEXT: .LBB6_9: // %else16 +; NO_SVE-NEXT: tbnz w8, #9, .LBB6_75 +; NO_SVE-NEXT: .LBB6_10: // %else18 +; NO_SVE-NEXT: tbnz w8, #10, .LBB6_76 +; NO_SVE-NEXT: .LBB6_11: // %else20 +; NO_SVE-NEXT: tbnz w8, #11, .LBB6_77 +; NO_SVE-NEXT: .LBB6_12: // %else22 +; NO_SVE-NEXT: tbnz w8, #12, .LBB6_78 +; NO_SVE-NEXT: .LBB6_13: // %else24 +; NO_SVE-NEXT: tbnz w8, #13, .LBB6_79 +; NO_SVE-NEXT: .LBB6_14: // %else26 +; NO_SVE-NEXT: tbnz w8, #14, .LBB6_80 +; NO_SVE-NEXT: .LBB6_15: // %else28 +; NO_SVE-NEXT: tbnz w8, #15, .LBB6_81 +; NO_SVE-NEXT: .LBB6_16: // %else30 +; NO_SVE-NEXT: tbnz w8, #16, .LBB6_82 +; NO_SVE-NEXT: .LBB6_17: // %else32 +; NO_SVE-NEXT: tbnz w8, #17, .LBB6_83 +; NO_SVE-NEXT: .LBB6_18: // %else34 +; NO_SVE-NEXT: tbnz w8, #18, .LBB6_84 +; NO_SVE-NEXT: .LBB6_19: // %else36 +; NO_SVE-NEXT: tbnz w8, #19, .LBB6_85 +; NO_SVE-NEXT: .LBB6_20: // %else38 +; NO_SVE-NEXT: tbnz w8, #20, .LBB6_86 +; NO_SVE-NEXT: .LBB6_21: // %else40 +; NO_SVE-NEXT: tbnz w8, #21, .LBB6_87 +; NO_SVE-NEXT: .LBB6_22: // %else42 +; NO_SVE-NEXT: tbnz w8, #22, .LBB6_88 +; NO_SVE-NEXT: .LBB6_23: // %else44 +; NO_SVE-NEXT: tbnz w8, #23, .LBB6_89 +; NO_SVE-NEXT: .LBB6_24: // %else46 +; NO_SVE-NEXT: tbnz w8, #24, .LBB6_90 +; NO_SVE-NEXT: .LBB6_25: // %else48 +; NO_SVE-NEXT: tbnz w8, #25, .LBB6_91 +; NO_SVE-NEXT: .LBB6_26: // %else50 +; NO_SVE-NEXT: tbnz w8, #26, .LBB6_92 +; NO_SVE-NEXT: .LBB6_27: // %else52 +; NO_SVE-NEXT: tbnz w8, #27, .LBB6_93 +; NO_SVE-NEXT: .LBB6_28: // %else54 +; NO_SVE-NEXT: tbnz w8, #28, .LBB6_94 +; NO_SVE-NEXT: .LBB6_29: // %else56 +; NO_SVE-NEXT: tbnz w8, #29, .LBB6_95 +; NO_SVE-NEXT: .LBB6_30: // %else58 +; NO_SVE-NEXT: tbnz w8, #30, .LBB6_96 +; NO_SVE-NEXT: .LBB6_31: // %else60 +; NO_SVE-NEXT: tbnz w8, #31, .LBB6_97 +; NO_SVE-NEXT: .LBB6_32: // %else62 +; NO_SVE-NEXT: tbnz x8, #32, .LBB6_98 +; NO_SVE-NEXT: .LBB6_33: // %else64 +; NO_SVE-NEXT: tbnz x8, #33, .LBB6_99 +; NO_SVE-NEXT: .LBB6_34: // %else66 +; NO_SVE-NEXT: tbnz x8, #34, .LBB6_100 +; NO_SVE-NEXT: .LBB6_35: // %else68 +; NO_SVE-NEXT: tbnz x8, #35, .LBB6_101 +; NO_SVE-NEXT: .LBB6_36: // %else70 +; NO_SVE-NEXT: tbnz x8, #36, .LBB6_102 +; NO_SVE-NEXT: .LBB6_37: // %else72 +; NO_SVE-NEXT: tbnz x8, #37, .LBB6_103 +; NO_SVE-NEXT: .LBB6_38: // %else74 +; NO_SVE-NEXT: tbnz x8, #38, .LBB6_104 +; NO_SVE-NEXT: .LBB6_39: // %else76 +; NO_SVE-NEXT: tbnz x8, #39, .LBB6_105 +; NO_SVE-NEXT: .LBB6_40: // %else78 +; NO_SVE-NEXT: tbnz x8, #40, .LBB6_106 +; NO_SVE-NEXT: .LBB6_41: // %else80 +; NO_SVE-NEXT: tbnz x8, #41, .LBB6_107 +; NO_SVE-NEXT: .LBB6_42: // %else82 +; NO_SVE-NEXT: tbnz x8, #42, .LBB6_108 +; NO_SVE-NEXT: .LBB6_43: // %else84 +; NO_SVE-NEXT: tbnz x8, #43, .LBB6_109 +; NO_SVE-NEXT: .LBB6_44: // %else86 +; NO_SVE-NEXT: tbnz x8, #44, .LBB6_110 +; NO_SVE-NEXT: .LBB6_45: // %else88 +; NO_SVE-NEXT: tbnz x8, #45, .LBB6_111 +; NO_SVE-NEXT: .LBB6_46: // %else90 +; NO_SVE-NEXT: tbnz x8, #46, .LBB6_112 +; NO_SVE-NEXT: .LBB6_47: // %else92 +; NO_SVE-NEXT: tbnz x8, #47, .LBB6_113 +; NO_SVE-NEXT: .LBB6_48: // %else94 +; NO_SVE-NEXT: tbnz x8, #48, .LBB6_114 +; NO_SVE-NEXT: .LBB6_49: // %else96 +; NO_SVE-NEXT: tbnz x8, #49, .LBB6_115 +; NO_SVE-NEXT: .LBB6_50: // %else98 +; NO_SVE-NEXT: tbnz x8, #50, .LBB6_116 +; NO_SVE-NEXT: .LBB6_51: // %else100 +; NO_SVE-NEXT: tbnz x8, #51, .LBB6_117 +; NO_SVE-NEXT: .LBB6_52: // %else102 +; NO_SVE-NEXT: tbnz x8, #52, .LBB6_118 +; NO_SVE-NEXT: .LBB6_53: // %else104 +; NO_SVE-NEXT: tbnz x8, #53, .LBB6_119 +; NO_SVE-NEXT: .LBB6_54: // 
%else106 +; NO_SVE-NEXT: tbnz x8, #54, .LBB6_120 +; NO_SVE-NEXT: .LBB6_55: // %else108 +; NO_SVE-NEXT: tbnz x8, #55, .LBB6_121 +; NO_SVE-NEXT: .LBB6_56: // %else110 +; NO_SVE-NEXT: tbnz x8, #56, .LBB6_122 +; NO_SVE-NEXT: .LBB6_57: // %else112 +; NO_SVE-NEXT: tbnz x8, #57, .LBB6_123 +; NO_SVE-NEXT: .LBB6_58: // %else114 +; NO_SVE-NEXT: tbnz x8, #58, .LBB6_124 +; NO_SVE-NEXT: .LBB6_59: // %else116 +; NO_SVE-NEXT: tbnz x8, #59, .LBB6_125 +; NO_SVE-NEXT: .LBB6_60: // %else118 +; NO_SVE-NEXT: tbnz x8, #60, .LBB6_126 +; NO_SVE-NEXT: .LBB6_61: // %else120 +; NO_SVE-NEXT: tbnz x8, #61, .LBB6_127 +; NO_SVE-NEXT: .LBB6_62: // %else122 +; NO_SVE-NEXT: tbnz x8, #62, .LBB6_128 +; NO_SVE-NEXT: .LBB6_63: // %else124 +; NO_SVE-NEXT: tbz x8, #63, .LBB6_65 +; NO_SVE-NEXT: .LBB6_64: // %cond.store125 +; NO_SVE-NEXT: add x8, x0, #252 +; NO_SVE-NEXT: st1 { v0.s }[3], [x8] +; NO_SVE-NEXT: .LBB6_65: // %else126 +; NO_SVE-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; NO_SVE-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload +; NO_SVE-NEXT: add sp, sp, #48 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB6_66: // %cond.store +; NO_SVE-NEXT: str s23, [x0] +; NO_SVE-NEXT: tbz w8, #1, .LBB6_2 +; NO_SVE-NEXT: .LBB6_67: // %cond.store1 +; NO_SVE-NEXT: add x9, x0, #4 +; NO_SVE-NEXT: st1 { v23.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB6_3 +; NO_SVE-NEXT: .LBB6_68: // %cond.store3 +; NO_SVE-NEXT: add x9, x0, #8 +; NO_SVE-NEXT: st1 { v23.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB6_4 +; NO_SVE-NEXT: .LBB6_69: // %cond.store5 +; NO_SVE-NEXT: add x9, x0, #12 +; NO_SVE-NEXT: st1 { v23.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB6_5 +; NO_SVE-NEXT: .LBB6_70: // %cond.store7 +; NO_SVE-NEXT: str s22, [x0, #16] +; NO_SVE-NEXT: tbz w8, #5, .LBB6_6 +; NO_SVE-NEXT: .LBB6_71: // %cond.store9 +; NO_SVE-NEXT: add x9, x0, #20 +; NO_SVE-NEXT: st1 { v22.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB6_7 +; NO_SVE-NEXT: .LBB6_72: // %cond.store11 +; NO_SVE-NEXT: add x9, x0, #24 +; NO_SVE-NEXT: st1 { v22.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB6_8 +; NO_SVE-NEXT: .LBB6_73: // %cond.store13 +; NO_SVE-NEXT: add x9, x0, #28 +; NO_SVE-NEXT: st1 { v22.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB6_9 +; NO_SVE-NEXT: .LBB6_74: // %cond.store15 +; NO_SVE-NEXT: str s21, [x0, #32] +; NO_SVE-NEXT: tbz w8, #9, .LBB6_10 +; NO_SVE-NEXT: .LBB6_75: // %cond.store17 +; NO_SVE-NEXT: add x9, x0, #36 +; NO_SVE-NEXT: st1 { v21.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB6_11 +; NO_SVE-NEXT: .LBB6_76: // %cond.store19 +; NO_SVE-NEXT: add x9, x0, #40 +; NO_SVE-NEXT: st1 { v21.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB6_12 +; NO_SVE-NEXT: .LBB6_77: // %cond.store21 +; NO_SVE-NEXT: add x9, x0, #44 +; NO_SVE-NEXT: st1 { v21.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB6_13 +; NO_SVE-NEXT: .LBB6_78: // %cond.store23 +; NO_SVE-NEXT: str s20, [x0, #48] +; NO_SVE-NEXT: tbz w8, #13, .LBB6_14 +; NO_SVE-NEXT: .LBB6_79: // %cond.store25 +; NO_SVE-NEXT: add x9, x0, #52 +; NO_SVE-NEXT: st1 { v20.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB6_15 +; NO_SVE-NEXT: .LBB6_80: // %cond.store27 +; NO_SVE-NEXT: add x9, x0, #56 +; NO_SVE-NEXT: st1 { v20.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB6_16 +; NO_SVE-NEXT: .LBB6_81: // %cond.store29 +; NO_SVE-NEXT: add x9, x0, #60 +; NO_SVE-NEXT: st1 { v20.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #16, .LBB6_17 +; NO_SVE-NEXT: .LBB6_82: // %cond.store31 +; NO_SVE-NEXT: str s19, [x0, #64] +; NO_SVE-NEXT: tbz w8, #17, .LBB6_18 +; NO_SVE-NEXT: .LBB6_83: // %cond.store33 +; NO_SVE-NEXT: add x9, x0, #68 +; NO_SVE-NEXT: st1 { 
v19.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #18, .LBB6_19 +; NO_SVE-NEXT: .LBB6_84: // %cond.store35 +; NO_SVE-NEXT: add x9, x0, #72 +; NO_SVE-NEXT: st1 { v19.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB6_20 +; NO_SVE-NEXT: .LBB6_85: // %cond.store37 +; NO_SVE-NEXT: add x9, x0, #76 +; NO_SVE-NEXT: st1 { v19.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB6_21 +; NO_SVE-NEXT: .LBB6_86: // %cond.store39 +; NO_SVE-NEXT: str s18, [x0, #80] +; NO_SVE-NEXT: tbz w8, #21, .LBB6_22 +; NO_SVE-NEXT: .LBB6_87: // %cond.store41 +; NO_SVE-NEXT: add x9, x0, #84 +; NO_SVE-NEXT: st1 { v18.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB6_23 +; NO_SVE-NEXT: .LBB6_88: // %cond.store43 +; NO_SVE-NEXT: add x9, x0, #88 +; NO_SVE-NEXT: st1 { v18.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB6_24 +; NO_SVE-NEXT: .LBB6_89: // %cond.store45 +; NO_SVE-NEXT: add x9, x0, #92 +; NO_SVE-NEXT: st1 { v18.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB6_25 +; NO_SVE-NEXT: .LBB6_90: // %cond.store47 +; NO_SVE-NEXT: str s17, [x0, #96] +; NO_SVE-NEXT: tbz w8, #25, .LBB6_26 +; NO_SVE-NEXT: .LBB6_91: // %cond.store49 +; NO_SVE-NEXT: add x9, x0, #100 +; NO_SVE-NEXT: st1 { v17.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB6_27 +; NO_SVE-NEXT: .LBB6_92: // %cond.store51 +; NO_SVE-NEXT: add x9, x0, #104 +; NO_SVE-NEXT: st1 { v17.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB6_28 +; NO_SVE-NEXT: .LBB6_93: // %cond.store53 +; NO_SVE-NEXT: add x9, x0, #108 +; NO_SVE-NEXT: st1 { v17.s }[3], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB6_29 +; NO_SVE-NEXT: .LBB6_94: // %cond.store55 +; NO_SVE-NEXT: str s16, [x0, #112] +; NO_SVE-NEXT: tbz w8, #29, .LBB6_30 +; NO_SVE-NEXT: .LBB6_95: // %cond.store57 +; NO_SVE-NEXT: add x9, x0, #116 +; NO_SVE-NEXT: st1 { v16.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB6_31 +; NO_SVE-NEXT: .LBB6_96: // %cond.store59 +; NO_SVE-NEXT: add x9, x0, #120 +; NO_SVE-NEXT: st1 { v16.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #31, .LBB6_32 +; NO_SVE-NEXT: .LBB6_97: // %cond.store61 +; NO_SVE-NEXT: add x9, x0, #124 +; NO_SVE-NEXT: st1 { v16.s }[3], [x9] +; NO_SVE-NEXT: tbz x8, #32, .LBB6_33 +; NO_SVE-NEXT: .LBB6_98: // %cond.store63 +; NO_SVE-NEXT: str s7, [x0, #128] +; NO_SVE-NEXT: tbz x8, #33, .LBB6_34 +; NO_SVE-NEXT: .LBB6_99: // %cond.store65 +; NO_SVE-NEXT: add x9, x0, #132 +; NO_SVE-NEXT: st1 { v7.s }[1], [x9] +; NO_SVE-NEXT: tbz x8, #34, .LBB6_35 +; NO_SVE-NEXT: .LBB6_100: // %cond.store67 +; NO_SVE-NEXT: add x9, x0, #136 +; NO_SVE-NEXT: st1 { v7.s }[2], [x9] +; NO_SVE-NEXT: tbz x8, #35, .LBB6_36 +; NO_SVE-NEXT: .LBB6_101: // %cond.store69 +; NO_SVE-NEXT: add x9, x0, #140 +; NO_SVE-NEXT: st1 { v7.s }[3], [x9] +; NO_SVE-NEXT: tbz x8, #36, .LBB6_37 +; NO_SVE-NEXT: .LBB6_102: // %cond.store71 +; NO_SVE-NEXT: str s6, [x0, #144] +; NO_SVE-NEXT: tbz x8, #37, .LBB6_38 +; NO_SVE-NEXT: .LBB6_103: // %cond.store73 +; NO_SVE-NEXT: add x9, x0, #148 +; NO_SVE-NEXT: st1 { v6.s }[1], [x9] +; NO_SVE-NEXT: tbz x8, #38, .LBB6_39 +; NO_SVE-NEXT: .LBB6_104: // %cond.store75 +; NO_SVE-NEXT: add x9, x0, #152 +; NO_SVE-NEXT: st1 { v6.s }[2], [x9] +; NO_SVE-NEXT: tbz x8, #39, .LBB6_40 +; NO_SVE-NEXT: .LBB6_105: // %cond.store77 +; NO_SVE-NEXT: add x9, x0, #156 +; NO_SVE-NEXT: st1 { v6.s }[3], [x9] +; NO_SVE-NEXT: tbz x8, #40, .LBB6_41 +; NO_SVE-NEXT: .LBB6_106: // %cond.store79 +; NO_SVE-NEXT: str s5, [x0, #160] +; NO_SVE-NEXT: tbz x8, #41, .LBB6_42 +; NO_SVE-NEXT: .LBB6_107: // %cond.store81 +; NO_SVE-NEXT: add x9, x0, #164 +; NO_SVE-NEXT: st1 { v5.s }[1], [x9] +; NO_SVE-NEXT: tbz x8, #42, .LBB6_43 +; NO_SVE-NEXT: .LBB6_108: // %cond.store83 +; 
NO_SVE-NEXT: add x9, x0, #168 +; NO_SVE-NEXT: st1 { v5.s }[2], [x9] +; NO_SVE-NEXT: tbz x8, #43, .LBB6_44 +; NO_SVE-NEXT: .LBB6_109: // %cond.store85 +; NO_SVE-NEXT: add x9, x0, #172 +; NO_SVE-NEXT: st1 { v5.s }[3], [x9] +; NO_SVE-NEXT: tbz x8, #44, .LBB6_45 +; NO_SVE-NEXT: .LBB6_110: // %cond.store87 +; NO_SVE-NEXT: str s4, [x0, #176] +; NO_SVE-NEXT: tbz x8, #45, .LBB6_46 +; NO_SVE-NEXT: .LBB6_111: // %cond.store89 +; NO_SVE-NEXT: add x9, x0, #180 +; NO_SVE-NEXT: st1 { v4.s }[1], [x9] +; NO_SVE-NEXT: tbz x8, #46, .LBB6_47 +; NO_SVE-NEXT: .LBB6_112: // %cond.store91 +; NO_SVE-NEXT: add x9, x0, #184 +; NO_SVE-NEXT: st1 { v4.s }[2], [x9] +; NO_SVE-NEXT: tbz x8, #47, .LBB6_48 +; NO_SVE-NEXT: .LBB6_113: // %cond.store93 +; NO_SVE-NEXT: add x9, x0, #188 +; NO_SVE-NEXT: st1 { v4.s }[3], [x9] +; NO_SVE-NEXT: tbz x8, #48, .LBB6_49 +; NO_SVE-NEXT: .LBB6_114: // %cond.store95 +; NO_SVE-NEXT: str s3, [x0, #192] +; NO_SVE-NEXT: tbz x8, #49, .LBB6_50 +; NO_SVE-NEXT: .LBB6_115: // %cond.store97 +; NO_SVE-NEXT: add x9, x0, #196 +; NO_SVE-NEXT: st1 { v3.s }[1], [x9] +; NO_SVE-NEXT: tbz x8, #50, .LBB6_51 +; NO_SVE-NEXT: .LBB6_116: // %cond.store99 +; NO_SVE-NEXT: add x9, x0, #200 +; NO_SVE-NEXT: st1 { v3.s }[2], [x9] +; NO_SVE-NEXT: tbz x8, #51, .LBB6_52 +; NO_SVE-NEXT: .LBB6_117: // %cond.store101 +; NO_SVE-NEXT: add x9, x0, #204 +; NO_SVE-NEXT: st1 { v3.s }[3], [x9] +; NO_SVE-NEXT: tbz x8, #52, .LBB6_53 +; NO_SVE-NEXT: .LBB6_118: // %cond.store103 +; NO_SVE-NEXT: str s2, [x0, #208] +; NO_SVE-NEXT: tbz x8, #53, .LBB6_54 +; NO_SVE-NEXT: .LBB6_119: // %cond.store105 +; NO_SVE-NEXT: add x9, x0, #212 +; NO_SVE-NEXT: st1 { v2.s }[1], [x9] +; NO_SVE-NEXT: tbz x8, #54, .LBB6_55 +; NO_SVE-NEXT: .LBB6_120: // %cond.store107 +; NO_SVE-NEXT: add x9, x0, #216 +; NO_SVE-NEXT: st1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz x8, #55, .LBB6_56 +; NO_SVE-NEXT: .LBB6_121: // %cond.store109 +; NO_SVE-NEXT: add x9, x0, #220 +; NO_SVE-NEXT: st1 { v2.s }[3], [x9] +; NO_SVE-NEXT: tbz x8, #56, .LBB6_57 +; NO_SVE-NEXT: .LBB6_122: // %cond.store111 +; NO_SVE-NEXT: str s1, [x0, #224] +; NO_SVE-NEXT: tbz x8, #57, .LBB6_58 +; NO_SVE-NEXT: .LBB6_123: // %cond.store113 +; NO_SVE-NEXT: add x9, x0, #228 +; NO_SVE-NEXT: st1 { v1.s }[1], [x9] +; NO_SVE-NEXT: tbz x8, #58, .LBB6_59 +; NO_SVE-NEXT: .LBB6_124: // %cond.store115 +; NO_SVE-NEXT: add x9, x0, #232 +; NO_SVE-NEXT: st1 { v1.s }[2], [x9] +; NO_SVE-NEXT: tbz x8, #59, .LBB6_60 +; NO_SVE-NEXT: .LBB6_125: // %cond.store117 +; NO_SVE-NEXT: add x9, x0, #236 +; NO_SVE-NEXT: st1 { v1.s }[3], [x9] +; NO_SVE-NEXT: tbz x8, #60, .LBB6_61 +; NO_SVE-NEXT: .LBB6_126: // %cond.store119 +; NO_SVE-NEXT: str s0, [x0, #240] +; NO_SVE-NEXT: tbz x8, #61, .LBB6_62 +; NO_SVE-NEXT: .LBB6_127: // %cond.store121 +; NO_SVE-NEXT: add x9, x0, #244 +; NO_SVE-NEXT: st1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz x8, #62, .LBB6_63 +; NO_SVE-NEXT: .LBB6_128: // %cond.store123 +; NO_SVE-NEXT: add x9, x0, #248 +; NO_SVE-NEXT: st1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbnz x8, #63, .LBB6_64 +; NO_SVE-NEXT: b .LBB6_65 +; +; VBITS_GE_256-LABEL: masked_store_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #56 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: mov x13, #16 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, 
lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z4.s, z19.s +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z18.s +; VBITS_GE_256-NEXT: fcmeq p5.s, p0/z, z2.s, z21.s +; VBITS_GE_256-NEXT: fcmeq p6.s, p0/z, z1.s, z20.s +; VBITS_GE_256-NEXT: fcmeq p7.s, p0/z, z0.s, z22.s +; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z7.s, z23.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p7, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p6, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p5, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p4, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p3, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p2, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p1, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: masked_store_v64f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #56 +; VBITS_GE_384-NEXT: mov x9, #48 +; VBITS_GE_384-NEXT: mov x10, #40 +; VBITS_GE_384-NEXT: mov x11, #32 +; VBITS_GE_384-NEXT: mov x12, #24 +; VBITS_GE_384-NEXT: mov x13, #16 +; VBITS_GE_384-NEXT: mov x14, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s +; VBITS_GE_384-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s +; VBITS_GE_384-NEXT: fcmeq p3.s, p0/z, z4.s, z19.s +; VBITS_GE_384-NEXT: fcmeq p4.s, p0/z, z3.s, z18.s +; VBITS_GE_384-NEXT: fcmeq p5.s, p0/z, z2.s, z21.s +; VBITS_GE_384-NEXT: fcmeq p6.s, p0/z, z1.s, z20.s +; VBITS_GE_384-NEXT: fcmeq p7.s, p0/z, z0.s, z22.s +; VBITS_GE_384-NEXT: fcmeq p0.s, p0/z, z7.s, z23.s +; VBITS_GE_384-NEXT: st1w { z0.s }, p7, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p6, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z2.s 
}, p5, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z3.s }, p4, [x0, x11, lsl #2] +; VBITS_GE_384-NEXT: st1w { z4.s }, p3, [x0, x12, lsl #2] +; VBITS_GE_384-NEXT: st1w { z5.s }, p2, [x0, x13, lsl #2] +; VBITS_GE_384-NEXT: st1w { z6.s }, p1, [x0, x14, lsl #2] +; VBITS_GE_384-NEXT: st1w { z7.s }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_store_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -153,14 +1584,156 @@ } define void @masked_store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>* %dest) #0 { +; NO_SVE-LABEL: masked_store_trunc_v8i64i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q5, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: ldp q3, q4, [x0, #32] +; NO_SVE-NEXT: cmeq v0.2d, v3.2d, v0.2d +; NO_SVE-NEXT: ldp q6, q7, [x1] +; NO_SVE-NEXT: cmeq v5.2d, v4.2d, v5.2d +; NO_SVE-NEXT: uzp1 v3.4s, v3.4s, v4.4s +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v5.4s +; NO_SVE-NEXT: cmeq v6.2d, v1.2d, v6.2d +; NO_SVE-NEXT: uzp1 v1.4s, v1.4s, v2.4s +; NO_SVE-NEXT: cmeq v7.2d, v2.2d, v7.2d +; NO_SVE-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; NO_SVE-NEXT: uzp1 v0.8h, v5.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v3.8h +; NO_SVE-NEXT: bfi w9, w8, #5, #1 +; NO_SVE-NEXT: and w8, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB7_9 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB7_10 +; NO_SVE-NEXT: .LBB7_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB7_11 +; NO_SVE-NEXT: .LBB7_3: // %else4 +; NO_SVE-NEXT: tbnz w8, #3, .LBB7_12 +; NO_SVE-NEXT: .LBB7_4: // %else6 +; NO_SVE-NEXT: tbnz w8, #4, .LBB7_13 +; NO_SVE-NEXT: .LBB7_5: // %else8 +; NO_SVE-NEXT: tbnz w8, #5, .LBB7_14 +; NO_SVE-NEXT: .LBB7_6: // %else10 +; NO_SVE-NEXT: tbnz w8, #6, .LBB7_15 +; NO_SVE-NEXT: .LBB7_7: // %else12 +; NO_SVE-NEXT: tbnz w8, #7, .LBB7_16 +; NO_SVE-NEXT: .LBB7_8: // %else14 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB7_9: // %cond.store +; NO_SVE-NEXT: st1 { v0.b }[0], [x2] +; NO_SVE-NEXT: tbz w8, #1, .LBB7_2 +; NO_SVE-NEXT: .LBB7_10: // %cond.store1 +; NO_SVE-NEXT: add x9, x2, #1 +; NO_SVE-NEXT: st1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB7_3 +; NO_SVE-NEXT: .LBB7_11: // %cond.store3 +; NO_SVE-NEXT: add x9, x2, #2 +; NO_SVE-NEXT: st1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB7_4 +; NO_SVE-NEXT: .LBB7_12: // %cond.store5 +; NO_SVE-NEXT: add x9, x2, #3 +; NO_SVE-NEXT: st1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB7_5 +; NO_SVE-NEXT: .LBB7_13: // %cond.store7 +; NO_SVE-NEXT: add x9, x2, #4 +; NO_SVE-NEXT: st1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB7_6 +; NO_SVE-NEXT: .LBB7_14: // %cond.store9 +; NO_SVE-NEXT: add x9, x2, #5 +; NO_SVE-NEXT: st1 { 
v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB7_7 +; NO_SVE-NEXT: .LBB7_15: // %cond.store11 +; NO_SVE-NEXT: add x9, x2, #6 +; NO_SVE-NEXT: st1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB7_8 +; NO_SVE-NEXT: .LBB7_16: // %cond.store13 +; NO_SVE-NEXT: add x8, x2, #7 +; NO_SVE-NEXT: st1 { v0.b }[7], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d +; VBITS_GE_256-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z3.s, #0 +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_GE_256-NEXT: st1b { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: masked_store_trunc_v8i64i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_384-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_384-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d +; VBITS_GE_384-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: ptrue p1.s, vl4 +; VBITS_GE_384-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_384-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: splice z3.s, p1, z3.s, z2.s +; VBITS_GE_384-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_384-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_384-NEXT: cmpne p0.s, p0/z, z3.s, #0 +; VBITS_GE_384-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_GE_384-NEXT: st1b { z1.s }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i8: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p[[P0:[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d -; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %b = load <8 x i64>, <8 x i64>* %bp @@ -171,13 +1744,161 @@ } define void @masked_store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i16>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v8i64i16: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { 
[[Z1:z[0-9]+]].d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d -; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: masked_store_trunc_v8i64i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q5, [x1, #32] +; NO_SVE-NEXT: ldp q1, q2, [x0] +; NO_SVE-NEXT: ldp q3, q4, [x0, #32] +; NO_SVE-NEXT: cmeq v0.2d, v3.2d, v0.2d +; NO_SVE-NEXT: ldp q6, q7, [x1] +; NO_SVE-NEXT: cmeq v5.2d, v4.2d, v5.2d +; NO_SVE-NEXT: uzp1 v3.4s, v3.4s, v4.4s +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v5.4s +; NO_SVE-NEXT: cmeq v6.2d, v1.2d, v6.2d +; NO_SVE-NEXT: cmeq v7.2d, v2.2d, v7.2d +; NO_SVE-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; NO_SVE-NEXT: uzp1 v0.8h, v5.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: umov w8, v0.b[1] +; NO_SVE-NEXT: umov w9, v0.b[0] +; NO_SVE-NEXT: umov w10, v0.b[2] +; NO_SVE-NEXT: umov w11, v0.b[3] +; NO_SVE-NEXT: umov w12, v0.b[4] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v0.b[5] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v0.b[6] +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v0.b[7] +; NO_SVE-NEXT: uzp1 v0.4s, v1.4s, v2.4s +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: bfi w9, w8, #5, #1 +; NO_SVE-NEXT: orr w8, w9, w10, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB8_9 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB8_10 +; NO_SVE-NEXT: .LBB8_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB8_11 +; NO_SVE-NEXT: .LBB8_3: // %else4 +; NO_SVE-NEXT: tbnz w8, #3, .LBB8_12 +; NO_SVE-NEXT: .LBB8_4: // %else6 +; NO_SVE-NEXT: tbnz w8, #4, .LBB8_13 +; NO_SVE-NEXT: .LBB8_5: // %else8 +; NO_SVE-NEXT: tbnz w8, #5, .LBB8_14 +; NO_SVE-NEXT: .LBB8_6: // %else10 +; NO_SVE-NEXT: tbnz w8, #6, .LBB8_15 +; NO_SVE-NEXT: .LBB8_7: // %else12 +; NO_SVE-NEXT: tbnz w8, #7, .LBB8_16 +; NO_SVE-NEXT: .LBB8_8: // %else14 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB8_9: // %cond.store +; NO_SVE-NEXT: str h0, [x2] +; NO_SVE-NEXT: tbz w8, #1, .LBB8_2 +; NO_SVE-NEXT: .LBB8_10: // %cond.store1 +; NO_SVE-NEXT: add x9, x2, #2 +; NO_SVE-NEXT: st1 { v0.h }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB8_3 +; NO_SVE-NEXT: .LBB8_11: // %cond.store3 +; NO_SVE-NEXT: add x9, x2, #4 +; NO_SVE-NEXT: st1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB8_4 +; NO_SVE-NEXT: .LBB8_12: // %cond.store5 +; NO_SVE-NEXT: add x9, x2, #6 +; NO_SVE-NEXT: st1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB8_5 +; NO_SVE-NEXT: .LBB8_13: // %cond.store7 +; NO_SVE-NEXT: add x9, x2, #8 +; NO_SVE-NEXT: st1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB8_6 +; NO_SVE-NEXT: .LBB8_14: // %cond.store9 +; NO_SVE-NEXT: add x9, x2, #10 +; NO_SVE-NEXT: st1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB8_7 +; NO_SVE-NEXT: .LBB8_15: // %cond.store11 +; NO_SVE-NEXT: add x9, x2, #12 +; NO_SVE-NEXT: st1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB8_8 +; NO_SVE-NEXT: .LBB8_16: // %cond.store13 +; NO_SVE-NEXT: add x8, x2, #14 +; NO_SVE-NEXT: st1 { v0.h }[7], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: 
masked_store_trunc_v8i64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z3.h, z3.h +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: masked_store_trunc_v8i64i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_384-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_384-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d +; VBITS_GE_384-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_384-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_384-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_384-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_384-NEXT: ptrue p1.s, vl4 +; VBITS_GE_384-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_384-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_384-NEXT: ptrue p0.h, vl8 +; VBITS_GE_384-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_384-NEXT: splice z3.s, p1, z3.s, z2.s +; VBITS_GE_384-NEXT: uzp1 z0.h, z3.h, z3.h +; VBITS_GE_384-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %b = load <8 x i64>, <8 x i64>* %bp %mask = icmp eq <8 x i64> %a, %b @@ -187,13 +1908,154 @@ } define void @masked_store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i32>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v8i64i32: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d -; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: masked_store_trunc_v8i64i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q2, q5, [x1, #32] +; NO_SVE-NEXT: ldp q3, q4, [x0] +; NO_SVE-NEXT: ldp q0, q1, 
[x0, #32] +; NO_SVE-NEXT: cmeq v2.2d, v0.2d, v2.2d +; NO_SVE-NEXT: ldp q6, q7, [x1] +; NO_SVE-NEXT: cmeq v5.2d, v1.2d, v5.2d +; NO_SVE-NEXT: uzp1 v2.4s, v2.4s, v5.4s +; NO_SVE-NEXT: cmeq v6.2d, v3.2d, v6.2d +; NO_SVE-NEXT: cmeq v7.2d, v4.2d, v7.2d +; NO_SVE-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; NO_SVE-NEXT: uzp1 v2.8h, v5.8h, v2.8h +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w9, v2.b[2] +; NO_SVE-NEXT: umov w10, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: umov w14, v2.b[6] +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: and w8, w12, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: and w9, w13, #0x1 +; NO_SVE-NEXT: bfi w10, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v2.b[7] +; NO_SVE-NEXT: bfi w10, w8, #4, #1 +; NO_SVE-NEXT: and w8, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w9, #5, #1 +; NO_SVE-NEXT: orr w8, w10, w8, lsl #6 +; NO_SVE-NEXT: orr w9, w8, w11, lsl #7 +; NO_SVE-NEXT: uzp1 v2.4s, v3.4s, v4.4s +; NO_SVE-NEXT: and w8, w9, #0xff +; NO_SVE-NEXT: tbnz w9, #0, .LBB9_9 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB9_10 +; NO_SVE-NEXT: .LBB9_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB9_11 +; NO_SVE-NEXT: .LBB9_3: // %else4 +; NO_SVE-NEXT: tbnz w8, #3, .LBB9_12 +; NO_SVE-NEXT: .LBB9_4: // %else6 +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: tbnz w8, #4, .LBB9_13 +; NO_SVE-NEXT: .LBB9_5: // %else8 +; NO_SVE-NEXT: tbnz w8, #5, .LBB9_14 +; NO_SVE-NEXT: .LBB9_6: // %else10 +; NO_SVE-NEXT: tbnz w8, #6, .LBB9_15 +; NO_SVE-NEXT: .LBB9_7: // %else12 +; NO_SVE-NEXT: tbnz w8, #7, .LBB9_16 +; NO_SVE-NEXT: .LBB9_8: // %else14 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB9_9: // %cond.store +; NO_SVE-NEXT: str s2, [x2] +; NO_SVE-NEXT: tbz w8, #1, .LBB9_2 +; NO_SVE-NEXT: .LBB9_10: // %cond.store1 +; NO_SVE-NEXT: add x9, x2, #4 +; NO_SVE-NEXT: st1 { v2.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB9_3 +; NO_SVE-NEXT: .LBB9_11: // %cond.store3 +; NO_SVE-NEXT: add x9, x2, #8 +; NO_SVE-NEXT: st1 { v2.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB9_4 +; NO_SVE-NEXT: .LBB9_12: // %cond.store5 +; NO_SVE-NEXT: add x9, x2, #12 +; NO_SVE-NEXT: st1 { v2.s }[3], [x9] +; NO_SVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NO_SVE-NEXT: tbz w8, #4, .LBB9_5 +; NO_SVE-NEXT: .LBB9_13: // %cond.store7 +; NO_SVE-NEXT: str s0, [x2, #16] +; NO_SVE-NEXT: tbz w8, #5, .LBB9_6 +; NO_SVE-NEXT: .LBB9_14: // %cond.store9 +; NO_SVE-NEXT: add x9, x2, #20 +; NO_SVE-NEXT: st1 { v0.s }[1], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB9_7 +; NO_SVE-NEXT: .LBB9_15: // %cond.store11 +; NO_SVE-NEXT: add x9, x2, #24 +; NO_SVE-NEXT: st1 { v0.s }[2], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB9_8 +; NO_SVE-NEXT: .LBB9_16: // %cond.store13 +; NO_SVE-NEXT: add x8, x2, #28 +; NO_SVE-NEXT: st1 { v0.s }[3], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d +; VBITS_GE_256-NEXT: mov z2.d, 
p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z3.s, #0 +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: masked_store_trunc_v8i64i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_384-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_384-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d +; VBITS_GE_384-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: ptrue p1.s, vl4 +; VBITS_GE_384-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_384-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: splice z3.s, p1, z3.s, z2.s +; VBITS_GE_384-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_384-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_384-NEXT: cmpne p0.s, p0/z, z3.s, #0 +; VBITS_GE_384-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %b = load <8 x i64>, <8 x i64>* %bp %mask = icmp eq <8 x i64> %a, %b @@ -203,13 +2065,233 @@ } define void @masked_store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i8>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v16i32i8: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s -; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: masked_store_trunc_v16i32i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q3, q2, [x1] +; NO_SVE-NEXT: ldp q0, q1, [x0] +; NO_SVE-NEXT: cmeq v3.4s, v0.4s, v3.4s +; NO_SVE-NEXT: cmeq v2.4s, v1.4s, v2.4s +; NO_SVE-NEXT: ldp q5, q4, [x0, #32] +; NO_SVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; NO_SVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NO_SVE-NEXT: xtn v2.8b, v2.8h +; NO_SVE-NEXT: ldp q6, q3, [x1, #32] +; NO_SVE-NEXT: umov w8, v2.b[1] +; NO_SVE-NEXT: umov w10, v2.b[2] +; NO_SVE-NEXT: umov w9, v2.b[0] +; NO_SVE-NEXT: umov w11, v2.b[3] +; NO_SVE-NEXT: umov w12, v2.b[4] +; NO_SVE-NEXT: umov w13, v2.b[5] +; NO_SVE-NEXT: cmeq v6.4s, v5.4s, v6.4s +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: cmeq v3.4s, v4.4s, v3.4s +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: uzp1 v3.8h, 
v6.8h, v3.8h +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v2.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v2.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: xtn v3.8b, v3.8h +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w11, v3.b[0] +; NO_SVE-NEXT: umov w12, v3.b[1] +; NO_SVE-NEXT: umov w13, v3.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v3.b[3] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v3.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: uzp1 v2.8h, v5.8h, v4.8h +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: and w11, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v3.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #10 +; NO_SVE-NEXT: umov w11, v3.b[6] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: umov w9, v3.b[7] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w9, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB10_17 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB10_18 +; NO_SVE-NEXT: .LBB10_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB10_19 +; NO_SVE-NEXT: .LBB10_3: // %else4 +; NO_SVE-NEXT: tbnz w8, #3, .LBB10_20 +; NO_SVE-NEXT: .LBB10_4: // %else6 +; NO_SVE-NEXT: tbnz w8, #4, .LBB10_21 +; NO_SVE-NEXT: .LBB10_5: // %else8 +; NO_SVE-NEXT: tbnz w8, #5, .LBB10_22 +; NO_SVE-NEXT: .LBB10_6: // %else10 +; NO_SVE-NEXT: tbnz w8, #6, .LBB10_23 +; NO_SVE-NEXT: .LBB10_7: // %else12 +; NO_SVE-NEXT: tbnz w8, #7, .LBB10_24 +; NO_SVE-NEXT: .LBB10_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #8, .LBB10_25 +; NO_SVE-NEXT: .LBB10_9: // %else16 +; NO_SVE-NEXT: tbnz w8, #9, .LBB10_26 +; NO_SVE-NEXT: .LBB10_10: // %else18 +; NO_SVE-NEXT: tbnz w8, #10, .LBB10_27 +; NO_SVE-NEXT: .LBB10_11: // %else20 +; NO_SVE-NEXT: tbnz w8, #11, .LBB10_28 +; NO_SVE-NEXT: .LBB10_12: // %else22 +; NO_SVE-NEXT: tbnz w8, #12, .LBB10_29 +; NO_SVE-NEXT: .LBB10_13: // %else24 +; NO_SVE-NEXT: tbnz w8, #13, .LBB10_30 +; NO_SVE-NEXT: .LBB10_14: // %else26 +; NO_SVE-NEXT: tbnz w8, #14, .LBB10_31 +; NO_SVE-NEXT: .LBB10_15: // %else28 +; NO_SVE-NEXT: tbnz w8, #15, .LBB10_32 +; NO_SVE-NEXT: .LBB10_16: // %else30 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB10_17: // %cond.store +; NO_SVE-NEXT: st1 { v0.b }[0], [x2] +; NO_SVE-NEXT: tbz w8, #1, .LBB10_2 +; NO_SVE-NEXT: .LBB10_18: // %cond.store1 +; NO_SVE-NEXT: add x9, x2, #1 +; NO_SVE-NEXT: st1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB10_3 +; NO_SVE-NEXT: .LBB10_19: // %cond.store3 +; NO_SVE-NEXT: add x9, x2, #2 +; NO_SVE-NEXT: st1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB10_4 +; NO_SVE-NEXT: .LBB10_20: // %cond.store5 +; NO_SVE-NEXT: add x9, x2, #3 +; NO_SVE-NEXT: st1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB10_5 +; NO_SVE-NEXT: .LBB10_21: // %cond.store7 +; NO_SVE-NEXT: add x9, x2, #4 +; NO_SVE-NEXT: st1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB10_6 +; NO_SVE-NEXT: .LBB10_22: // %cond.store9 +; NO_SVE-NEXT: add x9, x2, #5 +; NO_SVE-NEXT: st1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz 
w8, #6, .LBB10_7 +; NO_SVE-NEXT: .LBB10_23: // %cond.store11 +; NO_SVE-NEXT: add x9, x2, #6 +; NO_SVE-NEXT: st1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB10_8 +; NO_SVE-NEXT: .LBB10_24: // %cond.store13 +; NO_SVE-NEXT: add x9, x2, #7 +; NO_SVE-NEXT: st1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB10_9 +; NO_SVE-NEXT: .LBB10_25: // %cond.store15 +; NO_SVE-NEXT: add x9, x2, #8 +; NO_SVE-NEXT: st1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB10_10 +; NO_SVE-NEXT: .LBB10_26: // %cond.store17 +; NO_SVE-NEXT: add x9, x2, #9 +; NO_SVE-NEXT: st1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB10_11 +; NO_SVE-NEXT: .LBB10_27: // %cond.store19 +; NO_SVE-NEXT: add x9, x2, #10 +; NO_SVE-NEXT: st1 { v0.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB10_12 +; NO_SVE-NEXT: .LBB10_28: // %cond.store21 +; NO_SVE-NEXT: add x9, x2, #11 +; NO_SVE-NEXT: st1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB10_13 +; NO_SVE-NEXT: .LBB10_29: // %cond.store23 +; NO_SVE-NEXT: add x9, x2, #12 +; NO_SVE-NEXT: st1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB10_14 +; NO_SVE-NEXT: .LBB10_30: // %cond.store25 +; NO_SVE-NEXT: add x9, x2, #13 +; NO_SVE-NEXT: st1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB10_15 +; NO_SVE-NEXT: .LBB10_31: // %cond.store27 +; NO_SVE-NEXT: add x9, x2, #14 +; NO_SVE-NEXT: st1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB10_16 +; NO_SVE-NEXT: .LBB10_32: // %cond.store29 +; NO_SVE-NEXT: add x8, x2, #15 +; NO_SVE-NEXT: st1 { v0.b }[15], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s +; VBITS_GE_256-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z3.b, #0 +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: masked_store_trunc_v16i32i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_384-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s +; VBITS_GE_384-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_384-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_384-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_384-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_384-NEXT: 
uzp1 z2.b, z2.b, z2.b +; VBITS_GE_384-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_384-NEXT: mov v3.d[1], v2.d[0] +; VBITS_GE_384-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_384-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_384-NEXT: ptrue p0.b, vl16 +; VBITS_GE_384-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_384-NEXT: cmpne p0.b, p0/z, z3.b, #0 +; VBITS_GE_384-NEXT: st1b { z1.b }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v16i32i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: st1b { z0.s }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, <16 x i32>* %ap %b = load <16 x i32>, <16 x i32>* %bp %mask = icmp eq <16 x i32> %a, %b @@ -219,13 +2301,232 @@ } define void @masked_store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i16>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v16i32i16: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s -; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: masked_store_trunc_v16i32i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q1, q0, [x1] +; NO_SVE-NEXT: ldp q2, q3, [x0] +; NO_SVE-NEXT: cmeq v1.4s, v2.4s, v1.4s +; NO_SVE-NEXT: cmeq v4.4s, v3.4s, v0.4s +; NO_SVE-NEXT: ldp q6, q5, [x1, #32] +; NO_SVE-NEXT: uzp1 v4.8h, v1.8h, v4.8h +; NO_SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: umov w8, v4.b[1] +; NO_SVE-NEXT: umov w10, v4.b[2] +; NO_SVE-NEXT: umov w9, v4.b[0] +; NO_SVE-NEXT: umov w11, v4.b[3] +; NO_SVE-NEXT: umov w12, v4.b[4] +; NO_SVE-NEXT: umov w13, v4.b[5] +; NO_SVE-NEXT: cmeq v6.4s, v1.4s, v6.4s +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: cmeq v5.4s, v0.4s, v5.4s +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: uzp1 v5.8h, v6.8h, v5.8h +; NO_SVE-NEXT: bfi w9, w8, #1, #1 +; NO_SVE-NEXT: umov w8, v4.b[6] +; NO_SVE-NEXT: bfi w9, w10, #2, #1 +; NO_SVE-NEXT: umov w10, v4.b[7] +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: umov w11, v5.b[0] +; NO_SVE-NEXT: umov w12, v5.b[1] +; NO_SVE-NEXT: umov w13, v5.b[2] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w9, w8, lsl #6 +; NO_SVE-NEXT: umov w9, v5.b[3] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #7 +; NO_SVE-NEXT: umov w10, v5.b[4] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #8 +; NO_SVE-NEXT: umov w11, v5.b[5] +; NO_SVE-NEXT: orr w8, w8, w12, lsl #9 +; NO_SVE-NEXT: umov w12, v5.b[6] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #10 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #11 +; NO_SVE-NEXT: umov w9, v5.b[7] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: orr w8, w8, w11, lsl #13 +; 
NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w9, w8, w9, lsl #15 +; NO_SVE-NEXT: and w8, w9, #0xffff +; NO_SVE-NEXT: tbnz w9, #0, .LBB11_17 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB11_18 +; NO_SVE-NEXT: .LBB11_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB11_19 +; NO_SVE-NEXT: .LBB11_3: // %else4 +; NO_SVE-NEXT: tbnz w8, #3, .LBB11_20 +; NO_SVE-NEXT: .LBB11_4: // %else6 +; NO_SVE-NEXT: tbnz w8, #4, .LBB11_21 +; NO_SVE-NEXT: .LBB11_5: // %else8 +; NO_SVE-NEXT: tbnz w8, #5, .LBB11_22 +; NO_SVE-NEXT: .LBB11_6: // %else10 +; NO_SVE-NEXT: tbnz w8, #6, .LBB11_23 +; NO_SVE-NEXT: .LBB11_7: // %else12 +; NO_SVE-NEXT: tbnz w8, #7, .LBB11_24 +; NO_SVE-NEXT: .LBB11_8: // %else14 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: tbnz w8, #8, .LBB11_25 +; NO_SVE-NEXT: .LBB11_9: // %else16 +; NO_SVE-NEXT: tbnz w8, #9, .LBB11_26 +; NO_SVE-NEXT: .LBB11_10: // %else18 +; NO_SVE-NEXT: tbnz w8, #10, .LBB11_27 +; NO_SVE-NEXT: .LBB11_11: // %else20 +; NO_SVE-NEXT: tbnz w8, #11, .LBB11_28 +; NO_SVE-NEXT: .LBB11_12: // %else22 +; NO_SVE-NEXT: tbnz w8, #12, .LBB11_29 +; NO_SVE-NEXT: .LBB11_13: // %else24 +; NO_SVE-NEXT: tbnz w8, #13, .LBB11_30 +; NO_SVE-NEXT: .LBB11_14: // %else26 +; NO_SVE-NEXT: tbnz w8, #14, .LBB11_31 +; NO_SVE-NEXT: .LBB11_15: // %else28 +; NO_SVE-NEXT: tbnz w8, #15, .LBB11_32 +; NO_SVE-NEXT: .LBB11_16: // %else30 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB11_17: // %cond.store +; NO_SVE-NEXT: str h2, [x2] +; NO_SVE-NEXT: tbz w8, #1, .LBB11_2 +; NO_SVE-NEXT: .LBB11_18: // %cond.store1 +; NO_SVE-NEXT: add x9, x2, #2 +; NO_SVE-NEXT: st1 { v2.h }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB11_3 +; NO_SVE-NEXT: .LBB11_19: // %cond.store3 +; NO_SVE-NEXT: add x9, x2, #4 +; NO_SVE-NEXT: st1 { v2.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB11_4 +; NO_SVE-NEXT: .LBB11_20: // %cond.store5 +; NO_SVE-NEXT: add x9, x2, #6 +; NO_SVE-NEXT: st1 { v2.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB11_5 +; NO_SVE-NEXT: .LBB11_21: // %cond.store7 +; NO_SVE-NEXT: add x9, x2, #8 +; NO_SVE-NEXT: st1 { v2.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB11_6 +; NO_SVE-NEXT: .LBB11_22: // %cond.store9 +; NO_SVE-NEXT: add x9, x2, #10 +; NO_SVE-NEXT: st1 { v2.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB11_7 +; NO_SVE-NEXT: .LBB11_23: // %cond.store11 +; NO_SVE-NEXT: add x9, x2, #12 +; NO_SVE-NEXT: st1 { v2.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB11_8 +; NO_SVE-NEXT: .LBB11_24: // %cond.store13 +; NO_SVE-NEXT: add x9, x2, #14 +; NO_SVE-NEXT: st1 { v2.h }[7], [x9] +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: tbz w8, #8, .LBB11_9 +; NO_SVE-NEXT: .LBB11_25: // %cond.store15 +; NO_SVE-NEXT: str h0, [x2, #16] +; NO_SVE-NEXT: tbz w8, #9, .LBB11_10 +; NO_SVE-NEXT: .LBB11_26: // %cond.store17 +; NO_SVE-NEXT: add x9, x2, #18 +; NO_SVE-NEXT: st1 { v0.h }[1], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB11_11 +; NO_SVE-NEXT: .LBB11_27: // %cond.store19 +; NO_SVE-NEXT: add x9, x2, #20 +; NO_SVE-NEXT: st1 { v0.h }[2], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB11_12 +; NO_SVE-NEXT: .LBB11_28: // %cond.store21 +; NO_SVE-NEXT: add x9, x2, #22 +; NO_SVE-NEXT: st1 { v0.h }[3], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB11_13 +; NO_SVE-NEXT: .LBB11_29: // %cond.store23 +; NO_SVE-NEXT: add x9, x2, #24 +; NO_SVE-NEXT: st1 { v0.h }[4], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB11_14 +; NO_SVE-NEXT: .LBB11_30: // %cond.store25 +; NO_SVE-NEXT: add x9, x2, #26 +; NO_SVE-NEXT: st1 { v0.h }[5], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB11_15 +; NO_SVE-NEXT: .LBB11_31: // %cond.store27 +; 
NO_SVE-NEXT: add x9, x2, #28 +; NO_SVE-NEXT: st1 { v0.h }[6], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB11_16 +; NO_SVE-NEXT: .LBB11_32: // %cond.store29 +; NO_SVE-NEXT: add x8, x2, #30 +; NO_SVE-NEXT: st1 { v0.h }[7], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s +; VBITS_GE_256-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z0.h +; VBITS_GE_256-NEXT: sunpklo z2.h, z3.b +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: masked_store_trunc_v16i32i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_384-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s +; VBITS_GE_384-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_384-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_384-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_384-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_384-NEXT: mov v3.d[1], v2.d[0] +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_384-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_384-NEXT: ptrue p1.h, vl8 +; VBITS_GE_384-NEXT: splice z1.h, p1, z1.h, z0.h +; VBITS_GE_384-NEXT: sunpklo z2.h, z3.b +; VBITS_GE_384-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v16i32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, <16 x i32>* %ap %b = load <16 x i32>, <16 x i32>* %bp %mask = icmp eq <16 x i32> %a, %b @@ -235,13 +2536,369 @@ } define void @masked_store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i16>* %bp, <32 x i8>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v32i16i8: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].h, p[[P0]]/z, [[Z0]].h, [[Z1]].h -; VBITS_GE_512-NEXT: 
st1b { [[Z0]].h }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: masked_store_trunc_v32i16i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: sub sp, sp, #16 +; NO_SVE-NEXT: .cfi_def_cfa_offset 16 +; NO_SVE-NEXT: ldp q0, q1, [x0, #32] +; NO_SVE-NEXT: ldp q2, q4, [x1, #32] +; NO_SVE-NEXT: cmeq v5.8h, v0.8h, v2.8h +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: cmeq v4.8h, v1.8h, v4.8h +; NO_SVE-NEXT: umov w8, v5.b[1] +; NO_SVE-NEXT: umov w9, v5.b[2] +; NO_SVE-NEXT: umov w10, v5.b[0] +; NO_SVE-NEXT: umov w11, v5.b[3] +; NO_SVE-NEXT: umov w12, v5.b[4] +; NO_SVE-NEXT: umov w13, v5.b[5] +; NO_SVE-NEXT: xtn v4.8b, v4.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[6] +; NO_SVE-NEXT: ldp q3, q2, [x0] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v5.b[7] +; NO_SVE-NEXT: bfi w10, w8, #1, #1 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w16, v4.b[0] +; NO_SVE-NEXT: bfi w10, w9, #2, #1 +; NO_SVE-NEXT: and w13, w13, #0x1 +; NO_SVE-NEXT: umov w8, v4.b[1] +; NO_SVE-NEXT: ldp q7, q6, [x1] +; NO_SVE-NEXT: bfi w10, w11, #3, #1 +; NO_SVE-NEXT: umov w9, v4.b[2] +; NO_SVE-NEXT: and w14, w14, #0x1 +; NO_SVE-NEXT: bfi w10, w12, #4, #1 +; NO_SVE-NEXT: umov w11, v4.b[3] +; NO_SVE-NEXT: and w15, w15, #0x1 +; NO_SVE-NEXT: cmeq v5.8h, v3.8h, v7.8h +; NO_SVE-NEXT: bfi w10, w13, #5, #1 +; NO_SVE-NEXT: and w16, w16, #0x1 +; NO_SVE-NEXT: orr w10, w10, w14, lsl #6 +; NO_SVE-NEXT: xtn v5.8b, v5.8h +; NO_SVE-NEXT: and w8, w8, #0x1 +; NO_SVE-NEXT: umov w12, v4.b[4] +; NO_SVE-NEXT: orr w10, w10, w15, lsl #7 +; NO_SVE-NEXT: umov w13, v5.b[1] +; NO_SVE-NEXT: umov w14, v5.b[2] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w10, w10, w16, lsl #8 +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: orr w8, w10, w8, lsl #9 +; NO_SVE-NEXT: orr w8, w8, w9, lsl #10 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w9, v5.b[0] +; NO_SVE-NEXT: orr w8, w8, w11, lsl #11 +; NO_SVE-NEXT: umov w11, v4.b[5] +; NO_SVE-NEXT: and w12, w13, #0x1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[3] +; NO_SVE-NEXT: umov w15, v5.b[4] +; NO_SVE-NEXT: umov w16, v5.b[5] +; NO_SVE-NEXT: and w9, w9, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #12 +; NO_SVE-NEXT: and w10, w11, #0x1 +; NO_SVE-NEXT: bfi w9, w12, #1, #1 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[6] +; NO_SVE-NEXT: and w12, w15, #0x1 +; NO_SVE-NEXT: bfi w9, w13, #2, #1 +; NO_SVE-NEXT: cmeq v6.8h, v2.8h, v6.8h +; NO_SVE-NEXT: and w13, w16, #0x1 +; NO_SVE-NEXT: bfi w9, w11, #3, #1 +; NO_SVE-NEXT: umov w11, v5.b[7] +; NO_SVE-NEXT: xtn v5.8b, v6.8h +; NO_SVE-NEXT: bfi w9, w12, #4, #1 +; NO_SVE-NEXT: umov w12, v4.b[6] +; NO_SVE-NEXT: bfi w9, w13, #5, #1 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[0] +; NO_SVE-NEXT: and w11, w11, #0x1 +; NO_SVE-NEXT: umov w15, v5.b[1] +; NO_SVE-NEXT: orr w8, w8, w10, lsl #13 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #6 +; NO_SVE-NEXT: and w10, w12, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[2] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #7 +; NO_SVE-NEXT: and w11, w14, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[3] +; NO_SVE-NEXT: and w13, w15, #0x1 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: umov w10, v5.b[4] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #8 +; NO_SVE-NEXT: and w11, w12, #0x1 +; NO_SVE-NEXT: umov w12, v5.b[5] +; NO_SVE-NEXT: orr w9, w9, w13, lsl #9 +; NO_SVE-NEXT: and w13, w14, #0x1 +; NO_SVE-NEXT: umov w14, v5.b[6] +; NO_SVE-NEXT: orr w9, w9, w11, lsl #10 +; NO_SVE-NEXT: umov w11, 
v4.b[7] +; NO_SVE-NEXT: and w10, w10, #0x1 +; NO_SVE-NEXT: orr w9, w9, w13, lsl #11 +; NO_SVE-NEXT: and w12, w12, #0x1 +; NO_SVE-NEXT: umov w13, v5.b[7] +; NO_SVE-NEXT: orr w9, w9, w10, lsl #12 +; NO_SVE-NEXT: and w10, w14, #0x1 +; NO_SVE-NEXT: orr w11, w8, w11, lsl #15 +; NO_SVE-NEXT: orr w8, w9, w12, lsl #13 +; NO_SVE-NEXT: orr w8, w8, w10, lsl #14 +; NO_SVE-NEXT: orr w8, w8, w13, lsl #15 +; NO_SVE-NEXT: uzp1 v2.16b, v3.16b, v2.16b +; NO_SVE-NEXT: bfi w8, w11, #16, #16 +; NO_SVE-NEXT: tbnz w8, #0, .LBB12_33 +; NO_SVE-NEXT: // %bb.1: // %else +; NO_SVE-NEXT: tbnz w8, #1, .LBB12_34 +; NO_SVE-NEXT: .LBB12_2: // %else2 +; NO_SVE-NEXT: tbnz w8, #2, .LBB12_35 +; NO_SVE-NEXT: .LBB12_3: // %else4 +; NO_SVE-NEXT: tbnz w8, #3, .LBB12_36 +; NO_SVE-NEXT: .LBB12_4: // %else6 +; NO_SVE-NEXT: tbnz w8, #4, .LBB12_37 +; NO_SVE-NEXT: .LBB12_5: // %else8 +; NO_SVE-NEXT: tbnz w8, #5, .LBB12_38 +; NO_SVE-NEXT: .LBB12_6: // %else10 +; NO_SVE-NEXT: tbnz w8, #6, .LBB12_39 +; NO_SVE-NEXT: .LBB12_7: // %else12 +; NO_SVE-NEXT: tbnz w8, #7, .LBB12_40 +; NO_SVE-NEXT: .LBB12_8: // %else14 +; NO_SVE-NEXT: tbnz w8, #8, .LBB12_41 +; NO_SVE-NEXT: .LBB12_9: // %else16 +; NO_SVE-NEXT: tbnz w8, #9, .LBB12_42 +; NO_SVE-NEXT: .LBB12_10: // %else18 +; NO_SVE-NEXT: tbnz w8, #10, .LBB12_43 +; NO_SVE-NEXT: .LBB12_11: // %else20 +; NO_SVE-NEXT: tbnz w8, #11, .LBB12_44 +; NO_SVE-NEXT: .LBB12_12: // %else22 +; NO_SVE-NEXT: tbnz w8, #12, .LBB12_45 +; NO_SVE-NEXT: .LBB12_13: // %else24 +; NO_SVE-NEXT: tbnz w8, #13, .LBB12_46 +; NO_SVE-NEXT: .LBB12_14: // %else26 +; NO_SVE-NEXT: tbnz w8, #14, .LBB12_47 +; NO_SVE-NEXT: .LBB12_15: // %else28 +; NO_SVE-NEXT: tbnz w8, #15, .LBB12_48 +; NO_SVE-NEXT: .LBB12_16: // %else30 +; NO_SVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; NO_SVE-NEXT: tbnz w8, #16, .LBB12_49 +; NO_SVE-NEXT: .LBB12_17: // %else32 +; NO_SVE-NEXT: tbnz w8, #17, .LBB12_50 +; NO_SVE-NEXT: .LBB12_18: // %else34 +; NO_SVE-NEXT: tbnz w8, #18, .LBB12_51 +; NO_SVE-NEXT: .LBB12_19: // %else36 +; NO_SVE-NEXT: tbnz w8, #19, .LBB12_52 +; NO_SVE-NEXT: .LBB12_20: // %else38 +; NO_SVE-NEXT: tbnz w8, #20, .LBB12_53 +; NO_SVE-NEXT: .LBB12_21: // %else40 +; NO_SVE-NEXT: tbnz w8, #21, .LBB12_54 +; NO_SVE-NEXT: .LBB12_22: // %else42 +; NO_SVE-NEXT: tbnz w8, #22, .LBB12_55 +; NO_SVE-NEXT: .LBB12_23: // %else44 +; NO_SVE-NEXT: tbnz w8, #23, .LBB12_56 +; NO_SVE-NEXT: .LBB12_24: // %else46 +; NO_SVE-NEXT: tbnz w8, #24, .LBB12_57 +; NO_SVE-NEXT: .LBB12_25: // %else48 +; NO_SVE-NEXT: tbnz w8, #25, .LBB12_58 +; NO_SVE-NEXT: .LBB12_26: // %else50 +; NO_SVE-NEXT: tbnz w8, #26, .LBB12_59 +; NO_SVE-NEXT: .LBB12_27: // %else52 +; NO_SVE-NEXT: tbnz w8, #27, .LBB12_60 +; NO_SVE-NEXT: .LBB12_28: // %else54 +; NO_SVE-NEXT: tbnz w8, #28, .LBB12_61 +; NO_SVE-NEXT: .LBB12_29: // %else56 +; NO_SVE-NEXT: tbnz w8, #29, .LBB12_62 +; NO_SVE-NEXT: .LBB12_30: // %else58 +; NO_SVE-NEXT: tbnz w8, #30, .LBB12_63 +; NO_SVE-NEXT: .LBB12_31: // %else60 +; NO_SVE-NEXT: tbnz w8, #31, .LBB12_64 +; NO_SVE-NEXT: .LBB12_32: // %else62 +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; NO_SVE-NEXT: .LBB12_33: // %cond.store +; NO_SVE-NEXT: st1 { v2.b }[0], [x2] +; NO_SVE-NEXT: tbz w8, #1, .LBB12_2 +; NO_SVE-NEXT: .LBB12_34: // %cond.store1 +; NO_SVE-NEXT: add x9, x2, #1 +; NO_SVE-NEXT: st1 { v2.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #2, .LBB12_3 +; NO_SVE-NEXT: .LBB12_35: // %cond.store3 +; NO_SVE-NEXT: add x9, x2, #2 +; NO_SVE-NEXT: st1 { v2.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #3, .LBB12_4 +; NO_SVE-NEXT: .LBB12_36: // %cond.store5 +; NO_SVE-NEXT: add x9, x2, #3 +; NO_SVE-NEXT: st1 { 
v2.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #4, .LBB12_5 +; NO_SVE-NEXT: .LBB12_37: // %cond.store7 +; NO_SVE-NEXT: add x9, x2, #4 +; NO_SVE-NEXT: st1 { v2.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #5, .LBB12_6 +; NO_SVE-NEXT: .LBB12_38: // %cond.store9 +; NO_SVE-NEXT: add x9, x2, #5 +; NO_SVE-NEXT: st1 { v2.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #6, .LBB12_7 +; NO_SVE-NEXT: .LBB12_39: // %cond.store11 +; NO_SVE-NEXT: add x9, x2, #6 +; NO_SVE-NEXT: st1 { v2.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #7, .LBB12_8 +; NO_SVE-NEXT: .LBB12_40: // %cond.store13 +; NO_SVE-NEXT: add x9, x2, #7 +; NO_SVE-NEXT: st1 { v2.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #8, .LBB12_9 +; NO_SVE-NEXT: .LBB12_41: // %cond.store15 +; NO_SVE-NEXT: add x9, x2, #8 +; NO_SVE-NEXT: st1 { v2.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #9, .LBB12_10 +; NO_SVE-NEXT: .LBB12_42: // %cond.store17 +; NO_SVE-NEXT: add x9, x2, #9 +; NO_SVE-NEXT: st1 { v2.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #10, .LBB12_11 +; NO_SVE-NEXT: .LBB12_43: // %cond.store19 +; NO_SVE-NEXT: add x9, x2, #10 +; NO_SVE-NEXT: st1 { v2.b }[10], [x9] +; NO_SVE-NEXT: tbz w8, #11, .LBB12_12 +; NO_SVE-NEXT: .LBB12_44: // %cond.store21 +; NO_SVE-NEXT: add x9, x2, #11 +; NO_SVE-NEXT: st1 { v2.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #12, .LBB12_13 +; NO_SVE-NEXT: .LBB12_45: // %cond.store23 +; NO_SVE-NEXT: add x9, x2, #12 +; NO_SVE-NEXT: st1 { v2.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #13, .LBB12_14 +; NO_SVE-NEXT: .LBB12_46: // %cond.store25 +; NO_SVE-NEXT: add x9, x2, #13 +; NO_SVE-NEXT: st1 { v2.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #14, .LBB12_15 +; NO_SVE-NEXT: .LBB12_47: // %cond.store27 +; NO_SVE-NEXT: add x9, x2, #14 +; NO_SVE-NEXT: st1 { v2.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #15, .LBB12_16 +; NO_SVE-NEXT: .LBB12_48: // %cond.store29 +; NO_SVE-NEXT: add x9, x2, #15 +; NO_SVE-NEXT: st1 { v2.b }[15], [x9] +; NO_SVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; NO_SVE-NEXT: tbz w8, #16, .LBB12_17 +; NO_SVE-NEXT: .LBB12_49: // %cond.store31 +; NO_SVE-NEXT: add x9, x2, #16 +; NO_SVE-NEXT: st1 { v0.b }[0], [x9] +; NO_SVE-NEXT: tbz w8, #17, .LBB12_18 +; NO_SVE-NEXT: .LBB12_50: // %cond.store33 +; NO_SVE-NEXT: add x9, x2, #17 +; NO_SVE-NEXT: st1 { v0.b }[1], [x9] +; NO_SVE-NEXT: tbz w8, #18, .LBB12_19 +; NO_SVE-NEXT: .LBB12_51: // %cond.store35 +; NO_SVE-NEXT: add x9, x2, #18 +; NO_SVE-NEXT: st1 { v0.b }[2], [x9] +; NO_SVE-NEXT: tbz w8, #19, .LBB12_20 +; NO_SVE-NEXT: .LBB12_52: // %cond.store37 +; NO_SVE-NEXT: add x9, x2, #19 +; NO_SVE-NEXT: st1 { v0.b }[3], [x9] +; NO_SVE-NEXT: tbz w8, #20, .LBB12_21 +; NO_SVE-NEXT: .LBB12_53: // %cond.store39 +; NO_SVE-NEXT: add x9, x2, #20 +; NO_SVE-NEXT: st1 { v0.b }[4], [x9] +; NO_SVE-NEXT: tbz w8, #21, .LBB12_22 +; NO_SVE-NEXT: .LBB12_54: // %cond.store41 +; NO_SVE-NEXT: add x9, x2, #21 +; NO_SVE-NEXT: st1 { v0.b }[5], [x9] +; NO_SVE-NEXT: tbz w8, #22, .LBB12_23 +; NO_SVE-NEXT: .LBB12_55: // %cond.store43 +; NO_SVE-NEXT: add x9, x2, #22 +; NO_SVE-NEXT: st1 { v0.b }[6], [x9] +; NO_SVE-NEXT: tbz w8, #23, .LBB12_24 +; NO_SVE-NEXT: .LBB12_56: // %cond.store45 +; NO_SVE-NEXT: add x9, x2, #23 +; NO_SVE-NEXT: st1 { v0.b }[7], [x9] +; NO_SVE-NEXT: tbz w8, #24, .LBB12_25 +; NO_SVE-NEXT: .LBB12_57: // %cond.store47 +; NO_SVE-NEXT: add x9, x2, #24 +; NO_SVE-NEXT: st1 { v0.b }[8], [x9] +; NO_SVE-NEXT: tbz w8, #25, .LBB12_26 +; NO_SVE-NEXT: .LBB12_58: // %cond.store49 +; NO_SVE-NEXT: add x9, x2, #25 +; NO_SVE-NEXT: st1 { v0.b }[9], [x9] +; NO_SVE-NEXT: tbz w8, #26, .LBB12_27 +; NO_SVE-NEXT: .LBB12_59: // %cond.store51 +; NO_SVE-NEXT: add x9, x2, #26 +; NO_SVE-NEXT: st1 { v0.b 
}[10], [x9] +; NO_SVE-NEXT: tbz w8, #27, .LBB12_28 +; NO_SVE-NEXT: .LBB12_60: // %cond.store53 +; NO_SVE-NEXT: add x9, x2, #27 +; NO_SVE-NEXT: st1 { v0.b }[11], [x9] +; NO_SVE-NEXT: tbz w8, #28, .LBB12_29 +; NO_SVE-NEXT: .LBB12_61: // %cond.store55 +; NO_SVE-NEXT: add x9, x2, #28 +; NO_SVE-NEXT: st1 { v0.b }[12], [x9] +; NO_SVE-NEXT: tbz w8, #29, .LBB12_30 +; NO_SVE-NEXT: .LBB12_62: // %cond.store57 +; NO_SVE-NEXT: add x9, x2, #29 +; NO_SVE-NEXT: st1 { v0.b }[13], [x9] +; NO_SVE-NEXT: tbz w8, #30, .LBB12_31 +; NO_SVE-NEXT: .LBB12_63: // %cond.store59 +; NO_SVE-NEXT: add x9, x2, #30 +; NO_SVE-NEXT: st1 { v0.b }[14], [x9] +; NO_SVE-NEXT: tbz w8, #31, .LBB12_32 +; NO_SVE-NEXT: .LBB12_64: // %cond.store61 +; NO_SVE-NEXT: add x8, x2, #31 +; NO_SVE-NEXT: st1 { v0.b }[15], [x8] +; NO_SVE-NEXT: add sp, sp, #16 +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: masked_store_trunc_v32i16i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h +; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z1.h, z3.h +; VBITS_GE_256-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.b, vl16 +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: splice z3.b, p1, z3.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z3.b, #0 +; VBITS_GE_256-NEXT: splice z1.b, p1, z1.b, z0.b +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: masked_store_trunc_v32i16i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_384-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h +; VBITS_GE_384-NEXT: cmpeq p0.h, p0/z, z1.h, z3.h +; VBITS_GE_384-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_384-NEXT: ptrue p1.b, vl16 +; VBITS_GE_384-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_384-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: splice z3.b, p1, z3.b, z2.b +; VBITS_GE_384-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_384-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_384-NEXT: cmpne p0.b, p0/z, z3.b, #0 +; VBITS_GE_384-NEXT: splice z1.b, p1, z1.b, z0.b +; VBITS_GE_384-NEXT: st1b { z1.b }, p0, [x2] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v32i16i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: st1b { z0.h }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <32 x i16>, <32 x i16>* %ap %b = load <32 x i16>, <32 x i16>* %bp %mask = icmp eq <32 x i16> %a, %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll @@ -1,5 +1,6 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_384 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -67,6 +68,18 @@ ; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: sdiv_v64i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov w8, #32 +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_384-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_384-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_GE_384-NEXT: asrd z1.b, p0/m, z1.b, #5 +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_384-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: sdiv_v64i8: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 @@ -81,6 +94,46 @@ } define void @sdiv_v128i8(<128 x i8>* %a) #0 { +; VBITS_EQ_256-LABEL: sdiv_v128i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov w8, #96 +; VBITS_EQ_256-NEXT: mov w9, #32 +; VBITS_EQ_256-NEXT: mov w10, #64 +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_EQ_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_EQ_256-NEXT: ld1b { z3.b }, p0/z, [x0] +; VBITS_EQ_256-NEXT: asrd z1.b, p0/m, z1.b, #5 +; VBITS_EQ_256-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_EQ_256-NEXT: asrd z2.b, p0/m, z2.b, #5 +; VBITS_EQ_256-NEXT: asrd z3.b, p0/m, z3.b, #5 +; VBITS_EQ_256-NEXT: st1b { z2.b }, p0, [x0, x10] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x0, x9] +; VBITS_EQ_256-NEXT: st1b { z3.b }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: sdiv_v128i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov w8, #96 +; VBITS_GE_384-NEXT: mov w9, #32 +; VBITS_GE_384-NEXT: mov w10, #64 +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_384-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_384-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_384-NEXT: ld1b { z3.b }, p0/z, [x0] +; VBITS_GE_384-NEXT: asrd z1.b, p0/m, z1.b, #5 +; VBITS_GE_384-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_GE_384-NEXT: asrd z2.b, p0/m, z2.b, #5 +; VBITS_GE_384-NEXT: asrd z3.b, p0/m, z3.b, #5 +; VBITS_GE_384-NEXT: st1b { z2.b }, p0, [x0, x10] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_384-NEXT: st1b { z1.b }, p0, [x0, x9] +; VBITS_GE_384-NEXT: st1b { z3.b }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: sdiv_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -95,6 +148,78 @@ } define void @sdiv_v256i8(<256 x i8>* %a) #0 { +; VBITS_EQ_256-LABEL: sdiv_v256i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov w8, #192 +; VBITS_EQ_256-NEXT: mov w9, #96 +; VBITS_EQ_256-NEXT: 
mov w10, #32 +; VBITS_EQ_256-NEXT: mov w11, #160 +; VBITS_EQ_256-NEXT: mov w12, #64 +; VBITS_EQ_256-NEXT: mov w13, #224 +; VBITS_EQ_256-NEXT: mov w14, #128 +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0, x9] +; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x0, x10] +; VBITS_EQ_256-NEXT: ld1b { z2.b }, p0/z, [x0, x11] +; VBITS_EQ_256-NEXT: ld1b { z3.b }, p0/z, [x0, x12] +; VBITS_EQ_256-NEXT: ld1b { z4.b }, p0/z, [x0, x13] +; VBITS_EQ_256-NEXT: ld1b { z5.b }, p0/z, [x0, x14] +; VBITS_EQ_256-NEXT: ld1b { z6.b }, p0/z, [x0, x8] +; VBITS_EQ_256-NEXT: ld1b { z7.b }, p0/z, [x0] +; VBITS_EQ_256-NEXT: asrd z1.b, p0/m, z1.b, #5 +; VBITS_EQ_256-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_EQ_256-NEXT: asrd z3.b, p0/m, z3.b, #5 +; VBITS_EQ_256-NEXT: asrd z2.b, p0/m, z2.b, #5 +; VBITS_EQ_256-NEXT: asrd z5.b, p0/m, z5.b, #5 +; VBITS_EQ_256-NEXT: asrd z4.b, p0/m, z4.b, #5 +; VBITS_EQ_256-NEXT: asrd z6.b, p0/m, z6.b, #5 +; VBITS_EQ_256-NEXT: asrd z7.b, p0/m, z7.b, #5 +; VBITS_EQ_256-NEXT: st1b { z6.b }, p0, [x0, x8] +; VBITS_EQ_256-NEXT: st1b { z4.b }, p0, [x0, x13] +; VBITS_EQ_256-NEXT: st1b { z5.b }, p0, [x0, x14] +; VBITS_EQ_256-NEXT: st1b { z2.b }, p0, [x0, x11] +; VBITS_EQ_256-NEXT: st1b { z3.b }, p0, [x0, x12] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x0, x9] +; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x0, x10] +; VBITS_EQ_256-NEXT: st1b { z7.b }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: sdiv_v256i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov w8, #192 +; VBITS_GE_384-NEXT: mov w9, #96 +; VBITS_GE_384-NEXT: mov w10, #32 +; VBITS_GE_384-NEXT: mov w11, #160 +; VBITS_GE_384-NEXT: mov w12, #64 +; VBITS_GE_384-NEXT: mov w13, #224 +; VBITS_GE_384-NEXT: mov w14, #128 +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: ld1b { z0.b }, p0/z, [x0, x9] +; VBITS_GE_384-NEXT: ld1b { z1.b }, p0/z, [x0, x10] +; VBITS_GE_384-NEXT: ld1b { z2.b }, p0/z, [x0, x11] +; VBITS_GE_384-NEXT: ld1b { z3.b }, p0/z, [x0, x12] +; VBITS_GE_384-NEXT: ld1b { z4.b }, p0/z, [x0, x13] +; VBITS_GE_384-NEXT: ld1b { z5.b }, p0/z, [x0, x14] +; VBITS_GE_384-NEXT: ld1b { z6.b }, p0/z, [x0, x8] +; VBITS_GE_384-NEXT: ld1b { z7.b }, p0/z, [x0] +; VBITS_GE_384-NEXT: asrd z1.b, p0/m, z1.b, #5 +; VBITS_GE_384-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_GE_384-NEXT: asrd z3.b, p0/m, z3.b, #5 +; VBITS_GE_384-NEXT: asrd z2.b, p0/m, z2.b, #5 +; VBITS_GE_384-NEXT: asrd z5.b, p0/m, z5.b, #5 +; VBITS_GE_384-NEXT: asrd z4.b, p0/m, z4.b, #5 +; VBITS_GE_384-NEXT: asrd z6.b, p0/m, z6.b, #5 +; VBITS_GE_384-NEXT: asrd z7.b, p0/m, z7.b, #5 +; VBITS_GE_384-NEXT: st1b { z6.b }, p0, [x0, x8] +; VBITS_GE_384-NEXT: st1b { z4.b }, p0, [x0, x13] +; VBITS_GE_384-NEXT: st1b { z5.b }, p0, [x0, x14] +; VBITS_GE_384-NEXT: st1b { z2.b }, p0, [x0, x11] +; VBITS_GE_384-NEXT: st1b { z3.b }, p0, [x0, x12] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x0, x9] +; VBITS_GE_384-NEXT: st1b { z1.b }, p0, [x0, x10] +; VBITS_GE_384-NEXT: st1b { z7.b }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: sdiv_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 @@ -159,6 +284,18 @@ ; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: sdiv_v32i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_384-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_GE_384-NEXT: asrd z1.h, p0/m, z1.h, #5 +; 
VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: sdiv_v32i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -173,6 +310,46 @@ } define void @sdiv_v64i16(<64 x i16>* %a) #0 { +; VBITS_EQ_256-LABEL: sdiv_v64i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #48 +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: mov x10, #32 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: asrd z1.h, p0/m, z1.h, #5 +; VBITS_EQ_256-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_EQ_256-NEXT: asrd z2.h, p0/m, z2.h, #5 +; VBITS_EQ_256-NEXT: asrd z3.h, p0/m, z3.h, #5 +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: sdiv_v64i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #48 +; VBITS_GE_384-NEXT: mov x9, #16 +; VBITS_GE_384-NEXT: mov x10, #32 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_384-NEXT: asrd z1.h, p0/m, z1.h, #5 +; VBITS_GE_384-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_GE_384-NEXT: asrd z2.h, p0/m, z2.h, #5 +; VBITS_GE_384-NEXT: asrd z3.h, p0/m, z3.h, #5 +; VBITS_GE_384-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: st1h { z3.h }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: sdiv_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -187,6 +364,78 @@ } define void @sdiv_v128i16(<128 x i16>* %a) #0 { +; VBITS_EQ_256-LABEL: sdiv_v128i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #96 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x11, #80 +; VBITS_EQ_256-NEXT: mov x12, #32 +; VBITS_EQ_256-NEXT: mov x13, #112 +; VBITS_EQ_256-NEXT: mov x14, #64 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_EQ_256-NEXT: asrd z1.h, p0/m, z1.h, #5 +; VBITS_EQ_256-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_EQ_256-NEXT: asrd z3.h, p0/m, z3.h, #5 +; VBITS_EQ_256-NEXT: asrd z2.h, p0/m, z2.h, #5 +; VBITS_EQ_256-NEXT: asrd z5.h, p0/m, z5.h, #5 +; VBITS_EQ_256-NEXT: asrd z4.h, p0/m, z4.h, #5 +; VBITS_EQ_256-NEXT: asrd z6.h, p0/m, z6.h, #5 +; VBITS_EQ_256-NEXT: asrd z7.h, p0/m, z7.h, #5 +; VBITS_EQ_256-NEXT: st1h { z6.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x0, x13, lsl #1] +; 
VBITS_EQ_256-NEXT: st1h { z5.h }, p0, [x0, x14, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z7.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: sdiv_v128i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #96 +; VBITS_GE_384-NEXT: mov x9, #48 +; VBITS_GE_384-NEXT: mov x10, #16 +; VBITS_GE_384-NEXT: mov x11, #80 +; VBITS_GE_384-NEXT: mov x12, #32 +; VBITS_GE_384-NEXT: mov x13, #112 +; VBITS_GE_384-NEXT: mov x14, #64 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_GE_384-NEXT: asrd z1.h, p0/m, z1.h, #5 +; VBITS_GE_384-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_GE_384-NEXT: asrd z3.h, p0/m, z3.h, #5 +; VBITS_GE_384-NEXT: asrd z2.h, p0/m, z2.h, #5 +; VBITS_GE_384-NEXT: asrd z5.h, p0/m, z5.h, #5 +; VBITS_GE_384-NEXT: asrd z4.h, p0/m, z4.h, #5 +; VBITS_GE_384-NEXT: asrd z6.h, p0/m, z6.h, #5 +; VBITS_GE_384-NEXT: asrd z7.h, p0/m, z7.h, #5 +; VBITS_GE_384-NEXT: st1h { z6.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z4.h }, p0, [x0, x13, lsl #1] +; VBITS_GE_384-NEXT: st1h { z5.h }, p0, [x0, x14, lsl #1] +; VBITS_GE_384-NEXT: st1h { z2.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_384-NEXT: st1h { z3.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: st1h { z7.h }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: sdiv_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -251,6 +500,18 @@ ; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: sdiv_v16i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_GE_384-NEXT: asrd z1.s, p0/m, z1.s, #5 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: sdiv_v16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -265,6 +526,46 @@ } define void @sdiv_v32i32(<32 x i32>* %a) #0 { +; VBITS_EQ_256-LABEL: sdiv_v32i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: asrd z1.s, p0/m, z1.s, #5 +; VBITS_EQ_256-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_EQ_256-NEXT: asrd z2.s, p0/m, z2.s, #5 +; VBITS_EQ_256-NEXT: asrd z3.s, p0/m, z3.s, #5 +; VBITS_EQ_256-NEXT: st1w { z2.s }, 
p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: sdiv_v32i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #24 +; VBITS_GE_384-NEXT: mov x9, #8 +; VBITS_GE_384-NEXT: mov x10, #16 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: asrd z1.s, p0/m, z1.s, #5 +; VBITS_GE_384-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_GE_384-NEXT: asrd z2.s, p0/m, z2.s, #5 +; VBITS_GE_384-NEXT: asrd z3.s, p0/m, z3.s, #5 +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: sdiv_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -279,6 +580,78 @@ } define void @sdiv_v64i32(<64 x i32>* %a) #0 { +; VBITS_EQ_256-LABEL: sdiv_v64i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #48 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: mov x11, #40 +; VBITS_EQ_256-NEXT: mov x12, #16 +; VBITS_EQ_256-NEXT: mov x13, #56 +; VBITS_EQ_256-NEXT: mov x14, #32 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x0, x12, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_EQ_256-NEXT: asrd z1.s, p0/m, z1.s, #5 +; VBITS_EQ_256-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_EQ_256-NEXT: asrd z3.s, p0/m, z3.s, #5 +; VBITS_EQ_256-NEXT: asrd z2.s, p0/m, z2.s, #5 +; VBITS_EQ_256-NEXT: asrd z5.s, p0/m, z5.s, #5 +; VBITS_EQ_256-NEXT: asrd z4.s, p0/m, z4.s, #5 +; VBITS_EQ_256-NEXT: asrd z6.s, p0/m, z6.s, #5 +; VBITS_EQ_256-NEXT: asrd z7.s, p0/m, z7.s, #5 +; VBITS_EQ_256-NEXT: st1w { z6.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x13, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x0, x14, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z7.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: sdiv_v64i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #48 +; VBITS_GE_384-NEXT: mov x9, #24 +; VBITS_GE_384-NEXT: mov x10, #8 +; VBITS_GE_384-NEXT: mov x11, #40 +; VBITS_GE_384-NEXT: mov x12, #16 +; VBITS_GE_384-NEXT: mov x13, #56 +; VBITS_GE_384-NEXT: mov x14, #32 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_384-NEXT: 
ld1w { z4.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z6.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: asrd z1.s, p0/m, z1.s, #5 +; VBITS_GE_384-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_GE_384-NEXT: asrd z3.s, p0/m, z3.s, #5 +; VBITS_GE_384-NEXT: asrd z2.s, p0/m, z2.s, #5 +; VBITS_GE_384-NEXT: asrd z5.s, p0/m, z5.s, #5 +; VBITS_GE_384-NEXT: asrd z4.s, p0/m, z4.s, #5 +; VBITS_GE_384-NEXT: asrd z6.s, p0/m, z6.s, #5 +; VBITS_GE_384-NEXT: asrd z7.s, p0/m, z7.s, #5 +; VBITS_GE_384-NEXT: st1w { z6.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z4.s }, p0, [x0, x13, lsl #2] +; VBITS_GE_384-NEXT: st1w { z5.s }, p0, [x0, x14, lsl #2] +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z7.s }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: sdiv_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -344,6 +717,18 @@ ; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: sdiv_v8i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: asrd z0.d, p0/m, z0.d, #5 +; VBITS_GE_384-NEXT: asrd z1.d, p0/m, z1.d, #5 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: sdiv_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -358,6 +743,46 @@ } define void @sdiv_v16i64(<16 x i64>* %a) #0 { +; VBITS_EQ_256-LABEL: sdiv_v16i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: asrd z1.d, p0/m, z1.d, #5 +; VBITS_EQ_256-NEXT: asrd z0.d, p0/m, z0.d, #5 +; VBITS_EQ_256-NEXT: asrd z2.d, p0/m, z2.d, #5 +; VBITS_EQ_256-NEXT: asrd z3.d, p0/m, z3.d, #5 +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: sdiv_v16i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #12 +; VBITS_GE_384-NEXT: mov x9, #4 +; VBITS_GE_384-NEXT: mov x10, #8 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: asrd z1.d, p0/m, z1.d, #5 +; VBITS_GE_384-NEXT: asrd z0.d, p0/m, z0.d, #5 +; VBITS_GE_384-NEXT: asrd z2.d, p0/m, z2.d, #5 +; VBITS_GE_384-NEXT: asrd z3.d, p0/m, z3.d, #5 +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { 
z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: sdiv_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -372,6 +797,78 @@ } define void @sdiv_v32i64(<32 x i64>* %a) #0 { +; VBITS_EQ_256-LABEL: sdiv_v32i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: mov x12, #8 +; VBITS_EQ_256-NEXT: mov x13, #28 +; VBITS_EQ_256-NEXT: mov x14, #16 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_EQ_256-NEXT: asrd z1.d, p0/m, z1.d, #5 +; VBITS_EQ_256-NEXT: asrd z0.d, p0/m, z0.d, #5 +; VBITS_EQ_256-NEXT: asrd z3.d, p0/m, z3.d, #5 +; VBITS_EQ_256-NEXT: asrd z2.d, p0/m, z2.d, #5 +; VBITS_EQ_256-NEXT: asrd z5.d, p0/m, z5.d, #5 +; VBITS_EQ_256-NEXT: asrd z4.d, p0/m, z4.d, #5 +; VBITS_EQ_256-NEXT: asrd z6.d, p0/m, z6.d, #5 +; VBITS_EQ_256-NEXT: asrd z7.d, p0/m, z7.d, #5 +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: sdiv_v32i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #24 +; VBITS_GE_384-NEXT: mov x9, #12 +; VBITS_GE_384-NEXT: mov x10, #4 +; VBITS_GE_384-NEXT: mov x11, #20 +; VBITS_GE_384-NEXT: mov x12, #8 +; VBITS_GE_384-NEXT: mov x13, #28 +; VBITS_GE_384-NEXT: mov x14, #16 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z6.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: asrd z1.d, p0/m, z1.d, #5 +; VBITS_GE_384-NEXT: asrd z0.d, p0/m, z0.d, #5 +; VBITS_GE_384-NEXT: asrd z3.d, p0/m, z3.d, #5 +; VBITS_GE_384-NEXT: asrd z2.d, p0/m, z2.d, #5 +; VBITS_GE_384-NEXT: asrd z5.d, p0/m, z5.d, #5 +; VBITS_GE_384-NEXT: asrd z4.d, p0/m, z4.d, #5 +; VBITS_GE_384-NEXT: asrd z6.d, p0/m, z6.d, #5 +; VBITS_GE_384-NEXT: asrd z7.d, p0/m, z7.d, #5 +; VBITS_GE_384-NEXT: st1d { z6.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z4.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_384-NEXT: st1d { z5.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, 
[x0, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: sdiv_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -8,6 +9,28 @@ ; successfully exits code generation. define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32> %b) #0 { ; CHECK-LABEL: hang_when_merging_stores_after_legalisation: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #48 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: stp s0, s0, [sp, #24] +; CHECK-NEXT: stp s0, s0, [sp, #16] +; CHECK-NEXT: stp s0, s0, [sp, #8] +; CHECK-NEXT: stp s0, s0, [sp] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #16 +; CHECK-NEXT: st2 { v0.4s, v1.4s }, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <8 x i32> zeroinitializer %interleaved.vec = shufflevector <8 x i32> %splat, <8 x i32> undef, <8 x i32> store <8 x i32> %interleaved.vec, <8 x i32>* %a, align 4 @@ -17,8 +40,85 @@ ; Ensure we don't crash when trying to lower a shuffle via and extract define void @crash_when_lowering_extract_shuffle(<32 x i32>* %dst, i1 %cond) #0 { ; CHECK-LABEL: crash_when_lowering_extract_shuffle: -; CHECK: ld1w { z3.s }, p0/z, [x0] -; CHECK: st1w { z3.s }, p0, [x0] +; CHECK: // %bb.0: +; CHECK-NEXT: tbnz w1, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %vector.body +; CHECK-NEXT: mov z0.b, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: umov w8, v0.b[8] +; CHECK-NEXT: umov w9, v0.b[1] +; CHECK-NEXT: umov w10, v0.b[9] +; CHECK-NEXT: umov w11, v0.b[2] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: umov w8, v0.b[10] +; CHECK-NEXT: mov v1.b[1], w9 +; CHECK-NEXT: umov w9, v0.b[3] +; CHECK-NEXT: mov v2.b[1], w10 +; CHECK-NEXT: umov w10, v0.b[11] +; CHECK-NEXT: mov v1.b[2], w11 +; CHECK-NEXT: umov w11, v0.b[7] +; CHECK-NEXT: mov v2.b[2], w8 +; CHECK-NEXT: umov w8, v0.b[4] +; CHECK-NEXT: mov v1.b[3], w9 +; CHECK-NEXT: umov w9, v0.b[12] +; CHECK-NEXT: mov v2.b[3], w10 +; CHECK-NEXT: umov w10, v0.b[5] +; CHECK-NEXT: mov v1.b[4], w8 +; CHECK-NEXT: umov w8, v0.b[13] +; CHECK-NEXT: mov v2.b[4], w9 +; CHECK-NEXT: umov w9, v0.b[6] +; CHECK-NEXT: mov v1.b[5], w10 +; CHECK-NEXT: umov w10, v0.b[14] +; CHECK-NEXT: mov v2.b[5], w8 +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov v1.b[6], w9 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: mov v2.b[6], w10 +; CHECK-NEXT: umov w10, v0.b[15] +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: dup v3.2d, v0.d[1] +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: mov v1.b[7], w11 +; CHECK-NEXT: 
uunpklo z0.s, z0.h +; CHECK-NEXT: mov v2.b[7], w10 +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: mov x11, #8 +; CHECK-NEXT: lsl z0.s, z0.s, #31 +; CHECK-NEXT: lsl z3.s, z3.s, #31 +; CHECK-NEXT: asr z0.s, z0.s, #31 +; CHECK-NEXT: asr z3.s, z3.s, #31 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: and z3.s, z3.s, #0x1 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x11, lsl #2] +; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0] +; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: lsl z2.s, z2.s, #31 +; CHECK-NEXT: asr z1.s, z1.s, #31 +; CHECK-NEXT: asr z2.s, z2.s, #31 +; CHECK-NEXT: and z1.s, z1.s, #0x1 +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: mov z4.s, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z5.s, p2/m, #0 // =0x0 +; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 +; CHECK-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; CHECK-NEXT: mov z3.s, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 +; CHECK-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: st1w { z5.s }, p0, [x0, x9, lsl #2] +; CHECK-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2] +; CHECK-NEXT: st1w { z3.s }, p0, [x0] +; CHECK-NEXT: .LBB1_2: // %exit +; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer br i1 %cond, label %exit, label %vector.body diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll @@ -1,6 +1,7 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_384 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -26,9 +27,15 @@ ; Don't use SVE for 64-bit vectors. define <8 x i8> @splat_v8i8(i8 %a) #0 { +; NO_SVE-LABEL: splat_v8i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.8b, w0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v8i8: -; CHECK: dup v0.8b, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.8b, w0 +; CHECK-NEXT: ret %insert = insertelement <8 x i8> undef, i8 %a, i64 0 %splat = shufflevector <8 x i8> %insert, <8 x i8> undef, <8 x i32> zeroinitializer ret <8 x i8> %splat @@ -36,20 +43,33 @@ ; Don't use SVE for 128-bit vectors. 
define <16 x i8> @splat_v16i8(i8 %a) #0 { +; NO_SVE-LABEL: splat_v16i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.16b, w0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v16i8: -; CHECK: dup v0.16b, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.16b, w0 +; CHECK-NEXT: ret %insert = insertelement <16 x i8> undef, i8 %a, i64 0 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer ret <16 x i8> %splat } define void @splat_v32i8(i8 %a, <32 x i8>* %b) #0 { +; NO_SVE-LABEL: splat_v32i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.16b, w0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v32i8: -; CHECK-DAG: mov [[RES:z[0-9]+]].b, w0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl32 -; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: mov z0.b, w0 +; CHECK-NEXT: st1b { z0.b }, p0, [x1] +; CHECK-NEXT: ret %insert = insertelement <32 x i8> undef, i8 %a, i64 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer store <32 x i8> %splat, <32 x i8>* %b @@ -57,19 +77,39 @@ } define void @splat_v64i8(i8 %a, <64 x i8>* %b) #0 { -; CHECK-LABEL: splat_v64i8: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].b, w0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].b, vl64 -; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x1] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_v64i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.16b, w0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: stp q0, q0, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v64i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov w8, #32 +; VBITS_EQ_256-NEXT: mov z0.b, w0 +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v64i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov w8, #32 +; VBITS_GE_384-NEXT: mov z0.b, w0 +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: mov z0.b, w0 +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].b, w0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32 -; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32 -; VBITS_EQ_256-DAG: st1b { [[RES]].b }, [[PG]], [x1] -; VBITS_EQ_256-DAG: st1b { [[RES]].b }, [[PG]], [x1, x[[NUMELTS]]] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <64 x i8> undef, i8 %a, i64 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer store <64 x i8> %splat, <64 x i8>* %b @@ -77,11 +117,47 @@ } define void @splat_v128i8(i8 %a, <128 x i8>* %b) #0 { -; CHECK-LABEL: splat_v128i8: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].b, w0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].b, vl128 -; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x1] -; VBITS_GE_1024-NEXT: ret +; NO_SVE-LABEL: splat_v128i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.16b, w0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: stp q0, q0, [x1, #32] +; NO_SVE-NEXT: stp q0, q0, [x1, #64] +; NO_SVE-NEXT: stp q0, q0, [x1, #96] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v128i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov w8, #96 +; VBITS_EQ_256-NEXT: mov w9, #64 +; VBITS_EQ_256-NEXT: mov w10, #32 +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: mov z0.b, w0 +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1, x9] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1, x10] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v128i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov w8, #96 +; VBITS_GE_384-NEXT: mov w9, #64 +; VBITS_GE_384-NEXT: mov w10, #32 +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: mov z0.b, w0 +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1, x9] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1, x10] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v128i8: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 +; VBITS_GE_1024-NEXT: mov z0.b, w0 +; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <128 x i8> undef, i8 %a, i64 0 %splat = shufflevector <128 x i8> %insert, <128 x i8> undef, <128 x i32> zeroinitializer store <128 x i8> %splat, <128 x i8>* %b @@ -89,11 +165,67 @@ } define void @splat_v256i8(i8 %a, <256 x i8>* %b) #0 { -; CHECK-LABEL: splat_v256i8: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].b, w0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].b, vl256 -; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x1] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: splat_v256i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.16b, w0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: stp q0, q0, [x1, #32] +; NO_SVE-NEXT: stp q0, q0, [x1, #64] +; NO_SVE-NEXT: stp q0, q0, [x1, #96] +; NO_SVE-NEXT: stp q0, q0, [x1, #128] +; NO_SVE-NEXT: stp q0, q0, [x1, #160] +; NO_SVE-NEXT: stp q0, q0, [x1, #192] +; NO_SVE-NEXT: stp q0, q0, [x1, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v256i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov w8, #224 +; VBITS_EQ_256-NEXT: mov w9, #192 +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: mov z0.b, w0 +; VBITS_EQ_256-NEXT: mov w10, #160 +; VBITS_EQ_256-NEXT: mov w11, #128 +; VBITS_EQ_256-NEXT: mov w12, #96 +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_EQ_256-NEXT: mov w8, #64 +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1, x9] +; VBITS_EQ_256-NEXT: mov w9, #32 +; VBITS_EQ_256-NEXT: 
st1b { z0.b }, p0, [x1, x10] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1, x11] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1, x12] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1, x9] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v256i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov w8, #224 +; VBITS_GE_384-NEXT: mov w9, #192 +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: mov z0.b, w0 +; VBITS_GE_384-NEXT: mov w10, #160 +; VBITS_GE_384-NEXT: mov w11, #128 +; VBITS_GE_384-NEXT: mov w12, #96 +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_GE_384-NEXT: mov w8, #64 +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1, x9] +; VBITS_GE_384-NEXT: mov w9, #32 +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1, x10] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1, x11] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1, x12] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x1, x9] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v256i8: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 +; VBITS_GE_2048-NEXT: mov z0.b, w0 +; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <256 x i8> undef, i8 %a, i64 0 %splat = shufflevector <256 x i8> %insert, <256 x i8> undef, <256 x i32> zeroinitializer store <256 x i8> %splat, <256 x i8>* %b @@ -102,9 +234,15 @@ ; Don't use SVE for 64-bit vectors. define <4 x i16> @splat_v4i16(i16 %a) #0 { +; NO_SVE-LABEL: splat_v4i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.4h, w0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v4i16: -; CHECK: dup v0.4h, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.4h, w0 +; CHECK-NEXT: ret %insert = insertelement <4 x i16> undef, i16 %a, i64 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer ret <4 x i16> %splat @@ -112,20 +250,33 @@ ; Don't use SVE for 128-bit vectors. 
define <8 x i16> @splat_v8i16(i16 %a) #0 { +; NO_SVE-LABEL: splat_v8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.8h, w0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v8i16: -; CHECK: dup v0.8h, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.8h, w0 +; CHECK-NEXT: ret %insert = insertelement <8 x i16> undef, i16 %a, i64 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer ret <8 x i16> %splat } define void @splat_v16i16(i16 %a, <16 x i16>* %b) #0 { +; NO_SVE-LABEL: splat_v16i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.8h, w0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v16i16: -; CHECK-DAG: mov [[RES:z[0-9]+]].h, w0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov z0.h, w0 +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret %insert = insertelement <16 x i16> undef, i16 %a, i64 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer store <16 x i16> %splat, <16 x i16>* %b @@ -133,19 +284,39 @@ } define void @splat_v32i16(i16 %a, <32 x i16>* %b) #0 { -; CHECK-LABEL: splat_v32i16: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].h, w0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x1] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_v32i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.8h, w0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: stp q0, q0, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v32i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov z0.h, w0 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v32i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: mov z0.h, w0 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: mov z0.h, w0 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].h, w0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16 -; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG]], [x1] -; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG]], [x1, x[[NUMELTS]], lsl #1] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <32 x i16> undef, i16 %a, i64 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer store <32 x i16> %splat, <32 x i16>* %b @@ -153,11 +324,47 @@ } define void @splat_v64i16(i16 %a, <64 x i16>* %b) #0 { -; CHECK-LABEL: splat_v64i16: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].h, w0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x1] -; VBITS_GE_1024-NEXT: ret +; NO_SVE-LABEL: splat_v64i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.8h, w0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: stp q0, q0, [x1, #32] +; NO_SVE-NEXT: stp q0, q0, [x1, #64] +; NO_SVE-NEXT: stp q0, q0, [x1, #96] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v64i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #48 +; VBITS_EQ_256-NEXT: mov x9, #32 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov z0.h, w0 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v64i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #48 +; VBITS_GE_384-NEXT: mov x9, #32 +; VBITS_GE_384-NEXT: mov x10, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: mov z0.h, w0 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v64i16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: mov z0.h, w0 +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <64 x i16> undef, i16 %a, i64 0 %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer store <64 x i16> %splat, <64 x i16>* %b @@ -165,11 +372,67 @@ } define void @splat_v128i16(i16 %a, <128 x i16>* %b) #0 { -; CHECK-LABEL: splat_v128i16: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].h, w0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].h, vl128 -; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x1] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: splat_v128i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.8h, w0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: stp q0, q0, [x1, #32] +; NO_SVE-NEXT: stp q0, q0, [x1, #64] +; NO_SVE-NEXT: stp q0, q0, [x1, #96] +; NO_SVE-NEXT: stp q0, q0, [x1, #128] +; NO_SVE-NEXT: stp q0, q0, [x1, #160] +; NO_SVE-NEXT: stp q0, q0, [x1, #192] +; NO_SVE-NEXT: stp q0, q0, [x1, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v128i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #112 +; VBITS_EQ_256-NEXT: mov x9, #96 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov z0.h, w0 +; VBITS_EQ_256-NEXT: mov x10, #80 +; VBITS_EQ_256-NEXT: mov x11, #64 +; VBITS_EQ_256-NEXT: mov x12, #48 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: st1h { z0.h 
}, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x11, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v128i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #112 +; VBITS_GE_384-NEXT: mov x9, #96 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: mov z0.h, w0 +; VBITS_GE_384-NEXT: mov x10, #80 +; VBITS_GE_384-NEXT: mov x11, #64 +; VBITS_GE_384-NEXT: mov x12, #48 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: mov x8, #32 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_384-NEXT: mov x9, #16 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x11, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x12, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v128i16: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 +; VBITS_GE_2048-NEXT: mov z0.h, w0 +; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <128 x i16> undef, i16 %a, i64 0 %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer store <128 x i16> %splat, <128 x i16>* %b @@ -178,9 +441,15 @@ ; Don't use SVE for 64-bit vectors. define <2 x i32> @splat_v2i32(i32 %a) #0 { +; NO_SVE-LABEL: splat_v2i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.2s, w0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v2i32: -; CHECK: dup v0.2s, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.2s, w0 +; CHECK-NEXT: ret %insert = insertelement <2 x i32> undef, i32 %a, i64 0 %splat = shufflevector <2 x i32> %insert, <2 x i32> undef, <2 x i32> zeroinitializer ret <2 x i32> %splat @@ -188,20 +457,33 @@ ; Don't use SVE for 128-bit vectors. 
define <4 x i32> @splat_v4i32(i32 %a) #0 { +; NO_SVE-LABEL: splat_v4i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.4s, w0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v4i32: -; CHECK: dup v0.4s, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.4s, w0 +; CHECK-NEXT: ret %insert = insertelement <4 x i32> undef, i32 %a, i64 0 %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat } define void @splat_v8i32(i32 %a, <8 x i32>* %b) #0 { +; NO_SVE-LABEL: splat_v8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.4s, w0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v8i32: -; CHECK-DAG: mov [[RES:z[0-9]+]].s, w0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: mov z0.s, w0 +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret %insert = insertelement <8 x i32> undef, i32 %a, i64 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer store <8 x i32> %splat, <8 x i32>* %b @@ -209,19 +491,39 @@ } define void @splat_v16i32(i32 %a, <16 x i32>* %b) #0 { -; CHECK-LABEL: splat_v16i32: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].s, w0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x1] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_v16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.4s, w0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: stp q0, q0, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v16i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov z0.s, w0 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v16i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: mov z0.s, w0 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: mov z0.s, w0 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].s, w0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8 -; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG]], [x1] -; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG]], [x1, x[[NUMELTS]], lsl #2] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <16 x i32> undef, i32 %a, i64 0 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer store <16 x i32> %splat, <16 x i32>* %b @@ -229,11 +531,47 @@ } define void @splat_v32i32(i32 %a, <32 x i32>* %b) #0 { -; CHECK-LABEL: splat_v32i32: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].s, w0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x1] -; VBITS_GE_1024-NEXT: ret +; NO_SVE-LABEL: splat_v32i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.4s, w0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: stp q0, q0, [x1, #32] +; NO_SVE-NEXT: stp q0, q0, [x1, #64] +; NO_SVE-NEXT: stp q0, q0, [x1, #96] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v32i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov z0.s, w0 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v32i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #24 +; VBITS_GE_384-NEXT: mov x9, #16 +; VBITS_GE_384-NEXT: mov x10, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: mov z0.s, w0 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v32i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: mov z0.s, w0 +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <32 x i32> undef, i32 %a, i64 0 %splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer store <32 x i32> %splat, <32 x i32>* %b @@ -241,11 +579,67 @@ } define void @splat_v64i32(i32 %a, <64 x i32>* %b) #0 { -; CHECK-LABEL: splat_v64i32: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].s, w0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x1] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: splat_v64i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.4s, w0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: stp q0, q0, [x1, #32] +; NO_SVE-NEXT: stp q0, q0, [x1, #64] +; NO_SVE-NEXT: stp q0, q0, [x1, #96] +; NO_SVE-NEXT: stp q0, q0, [x1, #128] +; NO_SVE-NEXT: stp q0, q0, [x1, #160] +; NO_SVE-NEXT: stp q0, q0, [x1, #192] +; NO_SVE-NEXT: stp q0, q0, [x1, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v64i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #56 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov z0.s, w0 +; VBITS_EQ_256-NEXT: mov x10, #40 +; VBITS_EQ_256-NEXT: mov x11, #32 +; VBITS_EQ_256-NEXT: mov x12, #24 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, 
x9, lsl #2] +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v64i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #56 +; VBITS_GE_384-NEXT: mov x9, #48 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: mov z0.s, w0 +; VBITS_GE_384-NEXT: mov x10, #40 +; VBITS_GE_384-NEXT: mov x11, #32 +; VBITS_GE_384-NEXT: mov x12, #24 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: mov x9, #8 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v64i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: mov z0.s, w0 +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <64 x i32> undef, i32 %a, i64 0 %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer store <64 x i32> %splat, <64 x i32>* %b @@ -254,9 +648,15 @@ ; Don't use SVE for 64-bit vectors. define <1 x i64> @splat_v1i64(i64 %a) #0 { +; NO_SVE-LABEL: splat_v1i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov d0, x0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v1i64: -; CHECK: fmov d0, x0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ret %insert = insertelement <1 x i64> undef, i64 %a, i64 0 %splat = shufflevector <1 x i64> %insert, <1 x i64> undef, <1 x i32> zeroinitializer ret <1 x i64> %splat @@ -264,20 +664,33 @@ ; Don't use SVE for 128-bit vectors. 
define <2 x i64> @splat_v2i64(i64 %a) #0 { +; NO_SVE-LABEL: splat_v2i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.2d, x0 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v2i64: -; CHECK: dup v0.2d, x0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.2d, x0 +; CHECK-NEXT: ret %insert = insertelement <2 x i64> undef, i64 %a, i64 0 %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat } define void @splat_v4i64(i64 %a, <4 x i64>* %b) #0 { +; NO_SVE-LABEL: splat_v4i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.2d, x0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v4i64: -; CHECK-DAG: mov [[RES:z[0-9]+]].d, x0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: mov z0.d, x0 +; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: ret %insert = insertelement <4 x i64> undef, i64 %a, i64 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer store <4 x i64> %splat, <4 x i64>* %b @@ -285,19 +698,39 @@ } define void @splat_v8i64(i64 %a, <8 x i64>* %b) #0 { -; CHECK-LABEL: splat_v8i64: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].d, x0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x1] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_v8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.2d, x0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: stp q0, q0, [x1, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v8i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov z0.d, x0 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v8i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: mov z0.d, x0 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: mov z0.d, x0 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].d, x0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4 -; VBITS_EQ_256-DAG: st1d { [[RES]].d }, [[PG]], [x1] -; VBITS_EQ_256-DAG: st1d { [[RES]].d }, [[PG]], [x1, x[[NUMELTS]], lsl #3] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <8 x i64> undef, i64 %a, i64 0 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer store <8 x i64> %splat, <8 x i64>* %b @@ -305,11 +738,47 @@ } define void @splat_v16i64(i64 %a, <16 x i64>* %b) #0 { -; CHECK-LABEL: splat_v16i64: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].d, x0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x1] -; VBITS_GE_1024-NEXT: ret +; NO_SVE-LABEL: splat_v16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.2d, x0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: stp q0, q0, [x1, #32] +; NO_SVE-NEXT: stp q0, q0, [x1, #64] +; NO_SVE-NEXT: stp q0, q0, [x1, #96] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v16i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov z0.d, x0 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v16i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #12 +; VBITS_GE_384-NEXT: mov x9, #8 +; VBITS_GE_384-NEXT: mov x10, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: mov z0.d, x0 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v16i64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: mov z0.d, x0 +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <16 x i64> undef, i64 %a, i64 0 %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer store <16 x i64> %splat, <16 x i64>* %b @@ -317,11 +786,67 @@ } define void @splat_v32i64(i64 %a, <32 x i64>* %b) #0 { -; CHECK-LABEL: splat_v32i64: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].d, x0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x1] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: splat_v32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: dup v0.2d, x0 +; NO_SVE-NEXT: stp q0, q0, [x1] +; NO_SVE-NEXT: stp q0, q0, [x1, #32] +; NO_SVE-NEXT: stp q0, q0, [x1, #64] +; NO_SVE-NEXT: stp q0, q0, [x1, #96] +; NO_SVE-NEXT: stp q0, q0, [x1, #128] +; NO_SVE-NEXT: stp q0, q0, [x1, #160] +; NO_SVE-NEXT: stp q0, q0, [x1, #192] +; NO_SVE-NEXT: stp q0, q0, [x1, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v32i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #28 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov z0.d, x0 +; VBITS_EQ_256-NEXT: mov x10, #20 +; VBITS_EQ_256-NEXT: mov x11, #16 +; VBITS_EQ_256-NEXT: mov x12, #12 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl 
#3] +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v32i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #28 +; VBITS_GE_384-NEXT: mov x9, #24 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: mov z0.d, x0 +; VBITS_GE_384-NEXT: mov x10, #20 +; VBITS_GE_384-NEXT: mov x11, #16 +; VBITS_GE_384-NEXT: mov x12, #12 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: mov x9, #4 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v32i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: mov z0.d, x0 +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <32 x i64> undef, i64 %a, i64 0 %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer store <32 x i64> %splat, <32 x i64>* %b @@ -334,9 +859,17 @@ ; Don't use SVE for 64-bit vectors. define <4 x half> @splat_v4f16(half %a) #0 { +; NO_SVE-LABEL: splat_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NO_SVE-NEXT: dup v0.4h, v0.h[0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v4f16: -; CHECK: dup v0.4h, v0.h[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret %insert = insertelement <4 x half> undef, half %a, i64 0 %splat = shufflevector <4 x half> %insert, <4 x half> undef, <4 x i32> zeroinitializer ret <4 x half> %splat @@ -344,20 +877,37 @@ ; Don't use SVE for 128-bit vectors. 
define <8 x half> @splat_v8f16(half %a) #0 { +; NO_SVE-LABEL: splat_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NO_SVE-NEXT: dup v0.8h, v0.h[0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v8f16: -; CHECK: dup v0.8h, v0.h[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: ret %insert = insertelement <8 x half> undef, half %a, i64 0 %splat = shufflevector <8 x half> %insert, <8 x half> undef, <8 x i32> zeroinitializer ret <8 x half> %splat } define void @splat_v16f16(half %a, <16 x half>* %b) #0 { +; NO_SVE-LABEL: splat_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NO_SVE-NEXT: dup v0.8h, v0.h[0] +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v16f16: -; CHECK-DAG: mov [[RES:z[0-9]+]].h, h0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ret %insert = insertelement <16 x half> undef, half %a, i64 0 %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer store <16 x half> %splat, <16 x half>* %b @@ -365,19 +915,43 @@ } define void @splat_v32f16(half %a, <32 x half>* %b) #0 { -; CHECK-LABEL: splat_v32f16: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].h, h0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NO_SVE-NEXT: dup v0.8h, v0.h[0] +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v32f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov z0.h, h0 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v32f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: mov z0.h, h0 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v32f16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: mov z0.h, h0 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].h, h0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16 -; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <32 x half> undef, half %a, i64 0 %splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer store <32 x half> %splat, <32 x half>* %b @@ -385,11 +959,51 @@ } define void @splat_v64f16(half %a, <64 x half>* %b) #0 { -; CHECK-LABEL: splat_v64f16: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].h, h0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_1024-NEXT: ret +; NO_SVE-LABEL: splat_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NO_SVE-NEXT: dup v0.8h, v0.h[0] +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: stp q0, q0, [x0, #64] +; NO_SVE-NEXT: stp q0, q0, [x0, #96] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v64f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #48 +; VBITS_EQ_256-NEXT: mov x9, #32 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov z0.h, h0 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v64f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #48 +; VBITS_GE_384-NEXT: mov x9, #32 +; VBITS_GE_384-NEXT: mov x10, #16 +; VBITS_GE_384-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: mov z0.h, h0 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v64f16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: mov z0.h, h0 +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <64 x half> undef, half %a, i64 0 %splat = shufflevector <64 x half> %insert, <64 x half> undef, <64 x i32> zeroinitializer store <64 x half> %splat, <64 x half>* %b @@ -397,11 +1011,71 @@ } define void @splat_v128f16(half %a, <128 x half>* %b) #0 { -; CHECK-LABEL: splat_v128f16: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].h, h0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].h, vl128 -; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: splat_v128f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $h0 killed $h0 def $q0 +; NO_SVE-NEXT: dup v0.8h, v0.h[0] +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: stp q0, q0, [x0, #64] +; NO_SVE-NEXT: stp q0, q0, [x0, #96] +; NO_SVE-NEXT: stp q0, q0, [x0, #128] +; NO_SVE-NEXT: stp q0, q0, [x0, #160] +; NO_SVE-NEXT: stp q0, q0, [x0, #192] +; NO_SVE-NEXT: stp q0, q0, [x0, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v128f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #112 +; VBITS_EQ_256-NEXT: mov x9, 
#96 +; VBITS_EQ_256-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #80 +; VBITS_EQ_256-NEXT: mov z0.h, h0 +; VBITS_EQ_256-NEXT: mov x11, #64 +; VBITS_EQ_256-NEXT: mov x12, #48 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v128f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #112 +; VBITS_GE_384-NEXT: mov x9, #96 +; VBITS_GE_384-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: mov x10, #80 +; VBITS_GE_384-NEXT: mov z0.h, h0 +; VBITS_GE_384-NEXT: mov x11, #64 +; VBITS_GE_384-NEXT: mov x12, #48 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: mov x8, #32 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: mov x9, #16 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v128f16: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 +; VBITS_GE_2048-NEXT: mov z0.h, h0 +; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <128 x half> undef, half %a, i64 0 %splat = shufflevector <128 x half> %insert, <128 x half> undef, <128 x i32> zeroinitializer store <128 x half> %splat, <128 x half>* %b @@ -410,9 +1084,17 @@ ; Don't use SVE for 64-bit vectors. define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) #0 { +; NO_SVE-LABEL: splat_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $s0 killed $s0 def $q0 +; NO_SVE-NEXT: dup v0.2s, v0.s[0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v2f32: -; CHECK: dup v0.2s, v0.s[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret %insert = insertelement <2 x float> undef, float %a, i64 0 %splat = shufflevector <2 x float> %insert, <2 x float> undef, <2 x i32> zeroinitializer ret <2 x float> %splat @@ -420,20 +1102,37 @@ ; Don't use SVE for 128-bit vectors. 
define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) #0 { +; NO_SVE-LABEL: splat_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $s0 killed $s0 def $q0 +; NO_SVE-NEXT: dup v0.4s, v0.s[0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v4f32: -; CHECK: dup v0.4s, v0.s[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret %insert = insertelement <4 x float> undef, float %a, i64 0 %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer ret <4 x float> %splat } define void @splat_v8f32(float %a, <8 x float>* %b) #0 { +; NO_SVE-LABEL: splat_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $s0 killed $s0 def $q0 +; NO_SVE-NEXT: dup v0.4s, v0.s[0] +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v8f32: -; CHECK-DAG: mov [[RES:z[0-9]+]].s, s0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ret %insert = insertelement <8 x float> undef, float %a, i64 0 %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer store <8 x float> %splat, <8 x float>* %b @@ -441,19 +1140,43 @@ } define void @splat_v16f32(float %a, <16 x float>* %b) #0 { -; CHECK-LABEL: splat_v16f32: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].s, s0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $s0 killed $s0 def $q0 +; NO_SVE-NEXT: dup v0.4s, v0.s[0] +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v16f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov z0.s, s0 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v16f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: mov z0.s, s0 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v16f32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: mov z0.s, s0 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].s, s0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8 -; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <16 x float> undef, float %a, i64 0 %splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer store <16 x float> %splat, <16 x float>* %b @@ -461,11 +1184,51 @@ } define void @splat_v32f32(float %a, <32 x float>* %b) #0 { -; CHECK-LABEL: splat_v32f32: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].s, s0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_1024-NEXT: ret +; NO_SVE-LABEL: splat_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $s0 killed $s0 def $q0 +; NO_SVE-NEXT: dup v0.4s, v0.s[0] +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: stp q0, q0, [x0, #64] +; NO_SVE-NEXT: stp q0, q0, [x0, #96] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #24 +; VBITS_EQ_256-NEXT: mov x9, #16 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov z0.s, s0 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v32f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #24 +; VBITS_GE_384-NEXT: mov x9, #16 +; VBITS_GE_384-NEXT: mov x10, #8 +; VBITS_GE_384-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: mov z0.s, s0 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v32f32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: mov z0.s, s0 +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <32 x float> undef, float %a, i64 0 %splat = shufflevector <32 x float> %insert, <32 x float> undef, <32 x i32> zeroinitializer store <32 x float> %splat, <32 x float>* %b @@ -473,11 +1236,71 @@ } define void @splat_v64f32(float %a, <64 x float>* %b) #0 { -; CHECK-LABEL: splat_v64f32: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].s, s0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: splat_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $s0 killed $s0 def $q0 +; NO_SVE-NEXT: dup v0.4s, v0.s[0] +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: stp q0, q0, [x0, #64] +; NO_SVE-NEXT: stp q0, q0, [x0, #96] +; NO_SVE-NEXT: stp q0, q0, [x0, #128] +; NO_SVE-NEXT: stp q0, q0, [x0, #160] +; NO_SVE-NEXT: stp q0, q0, [x0, #192] +; NO_SVE-NEXT: stp q0, q0, [x0, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v64f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #56 +; VBITS_EQ_256-NEXT: mov 
x9, #48 +; VBITS_EQ_256-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #40 +; VBITS_EQ_256-NEXT: mov z0.s, s0 +; VBITS_EQ_256-NEXT: mov x11, #32 +; VBITS_EQ_256-NEXT: mov x12, #24 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v64f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #56 +; VBITS_GE_384-NEXT: mov x9, #48 +; VBITS_GE_384-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: mov x10, #40 +; VBITS_GE_384-NEXT: mov z0.s, s0 +; VBITS_GE_384-NEXT: mov x11, #32 +; VBITS_GE_384-NEXT: mov x12, #24 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: mov x9, #8 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v64f32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: mov z0.s, s0 +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <64 x float> undef, float %a, i64 0 %splat = shufflevector <64 x float> %insert, <64 x float> undef, <64 x i32> zeroinitializer store <64 x float> %splat, <64 x float>* %b @@ -486,9 +1309,13 @@ ; Don't use SVE for 64-bit vectors. define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) #0 { +; NO_SVE-LABEL: splat_v1f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v1f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ret %insert = insertelement <1 x double> undef, double %a, i64 0 %splat = shufflevector <1 x double> %insert, <1 x double> undef, <1 x i32> zeroinitializer ret <1 x double> %splat @@ -496,20 +1323,37 @@ ; Don't use SVE for 128-bit vectors. 
define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) #0 { +; NO_SVE-LABEL: splat_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: dup v0.2d, v0.d[0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v2f64: -; CHECK: dup v0.2d, v0.d[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret %insert = insertelement <2 x double> undef, double %a, i64 0 %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer ret <2 x double> %splat } define void @splat_v4f64(double %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: splat_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: dup v0.2d, v0.d[0] +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: splat_v4f64: -; CHECK-DAG: mov [[RES:z[0-9]+]].d, d0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: mov z0.d, d0 +; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret %insert = insertelement <4 x double> undef, double %a, i64 0 %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer store <4 x double> %splat, <4 x double>* %b @@ -517,19 +1361,43 @@ } define void @splat_v8f64(double %a, <8 x double>* %b) #0 { -; CHECK-LABEL: splat_v8f64: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].d, d0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: dup v0.2d, v0.d[0] +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v8f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov z0.d, d0 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v8f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: mov z0.d, d0 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v8f64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: mov z0.d, d0 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].d, d0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4 -; VBITS_EQ_256-DAG: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1d { [[RES]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <8 x double> undef, double %a, i64 0 %splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer store <8 x double> %splat, <8 x double>* %b @@ -537,11 +1405,51 @@ } define void @splat_v16f64(double %a, <16 x double>* %b) #0 { -; CHECK-LABEL: splat_v16f64: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].d, d0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_1024-NEXT: ret +; NO_SVE-LABEL: splat_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: dup v0.2d, v0.d[0] +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: stp q0, q0, [x0, #64] +; NO_SVE-NEXT: stp q0, q0, [x0, #96] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #12 +; VBITS_EQ_256-NEXT: mov x9, #8 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov z0.d, d0 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v16f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #12 +; VBITS_GE_384-NEXT: mov x9, #8 +; VBITS_GE_384-NEXT: mov x10, #4 +; VBITS_GE_384-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: mov z0.d, d0 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v16f64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: mov z0.d, d0 +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <16 x double> undef, double %a, i64 0 %splat = shufflevector <16 x double> %insert, <16 x double> undef, <16 x i32> zeroinitializer store <16 x double> %splat, <16 x double>* %b @@ -549,11 +1457,71 @@ } define void @splat_v32f64(double %a, <32 x double>* %b) #0 { -; CHECK-LABEL: splat_v32f64: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].d, d0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_2048-NEXT: ret +; NO_SVE-LABEL: splat_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO_SVE-NEXT: dup v0.2d, v0.d[0] +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: stp q0, q0, [x0, #64] +; NO_SVE-NEXT: stp q0, q0, [x0, #96] +; NO_SVE-NEXT: stp q0, q0, [x0, #128] +; NO_SVE-NEXT: stp q0, q0, [x0, #160] +; NO_SVE-NEXT: stp q0, q0, [x0, #192] +; NO_SVE-NEXT: stp q0, q0, [x0, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #28 +; 
VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov x10, #20 +; VBITS_EQ_256-NEXT: mov z0.d, d0 +; VBITS_EQ_256-NEXT: mov x11, #16 +; VBITS_EQ_256-NEXT: mov x12, #12 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #4 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_v32f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #28 +; VBITS_GE_384-NEXT: mov x9, #24 +; VBITS_GE_384-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: mov x10, #20 +; VBITS_GE_384-NEXT: mov z0.d, d0 +; VBITS_GE_384-NEXT: mov x11, #16 +; VBITS_GE_384-NEXT: mov x12, #12 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: mov x9, #4 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v32f64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: mov z0.d, d0 +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <32 x double> undef, double %a, i64 0 %splat = shufflevector <32 x double> %insert, <32 x double> undef, <32 x i32> zeroinitializer store <32 x double> %splat, <32 x double>* %b @@ -565,11 +1533,37 @@ ; define void @splat_imm_v64i8(<64 x i8>* %a) #0 { -; CHECK-LABEL: splat_imm_v64i8: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].b, #1 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].b, vl64 -; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_imm_v64i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: movi v0.16b, #1 +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_imm_v64i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov w8, #32 +; VBITS_EQ_256-NEXT: mov z0.b, #1 // =0x1 +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_imm_v64i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov w8, #32 +; VBITS_GE_384-NEXT: mov z0.b, #1 // =0x1 +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov z0.b, #1 // =0x1 +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <64 x i8> 
undef, i8 1, i64 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer store <64 x i8> %splat, <64 x i8>* %a @@ -577,11 +1571,37 @@ } define void @splat_imm_v32i16(<32 x i16>* %a) #0 { -; CHECK-LABEL: splat_imm_v32i16: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].h, #2 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_imm_v32i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: movi v0.8h, #2 +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_imm_v32i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov z0.h, #2 // =0x2 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_imm_v32i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: mov z0.h, #2 // =0x2 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov z0.h, #2 // =0x2 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <32 x i16> undef, i16 2, i64 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer store <32 x i16> %splat, <32 x i16>* %a @@ -589,11 +1609,37 @@ } define void @splat_imm_v16i32(<16 x i32>* %a) #0 { -; CHECK-LABEL: splat_imm_v16i32: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].s, #3 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_imm_v16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: movi v0.4s, #3 +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_imm_v16i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov z0.s, #3 // =0x3 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_imm_v16i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: mov z0.s, #3 // =0x3 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov z0.s, #3 // =0x3 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <16 x i32> undef, i32 3, i64 0 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer store <16 x i32> %splat, <16 x i32>* %a @@ -601,11 +1647,38 @@ } define void @splat_imm_v8i64(<8 x i64>* %a) #0 { -; CHECK-LABEL: splat_imm_v8i64: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].d, #4 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_imm_v8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: mov w8, #4 +; NO_SVE-NEXT: dup v0.2d, x8 +; 
NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_imm_v8i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov z0.d, #4 // =0x4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_imm_v8i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: mov z0.d, #4 // =0x4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov z0.d, #4 // =0x4 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <8 x i64> undef, i64 4, i64 0 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer store <8 x i64> %splat, <8 x i64>* %a @@ -617,11 +1690,37 @@ ; define void @splat_imm_v32f16(<32 x half>* %a) #0 { -; CHECK-LABEL: splat_imm_v32f16: -; VBITS_GE_512-DAG: fmov [[RES:z[0-9]+]].h, #5.00000000 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_imm_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: movi v0.8h, #69, lsl #8 +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_imm_v32f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: fmov z0.h, #5.00000000 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_imm_v32f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: fmov z0.h, #5.00000000 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v32f16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: fmov z0.h, #5.00000000 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <32 x half> undef, half 5.0, i64 0 %splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer store <32 x half> %splat, <32 x half>* %a @@ -629,11 +1728,37 @@ } define void @splat_imm_v16f32(<16 x float>* %a) #0 { -; CHECK-LABEL: splat_imm_v16f32: -; VBITS_GE_512-DAG: fmov [[RES:z[0-9]+]].s, #6.00000000 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_imm_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov v0.4s, #6.00000000 +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_imm_v16f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: fmov z0.s, #6.00000000 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_imm_v16f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: 
mov x8, #8 +; VBITS_GE_384-NEXT: fmov z0.s, #6.00000000 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v16f32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: fmov z0.s, #6.00000000 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <16 x float> undef, float 6.0, i64 0 %splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer store <16 x float> %splat, <16 x float>* %a @@ -641,11 +1766,37 @@ } define void @splat_imm_v8f64(<8 x double>* %a) #0 { -; CHECK-LABEL: splat_imm_v8f64: -; VBITS_GE_512-DAG: fmov [[RES:z[0-9]+]].d, #7.00000000 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; NO_SVE-LABEL: splat_imm_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: fmov v0.2d, #7.00000000 +; NO_SVE-NEXT: stp q0, q0, [x0] +; NO_SVE-NEXT: stp q0, q0, [x0, #32] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: splat_imm_v8f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: fmov z0.d, #7.00000000 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: splat_imm_v8f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: fmov z0.d, #7.00000000 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v8f64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: fmov z0.d, #7.00000000 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <8 x double> undef, double 7.0, i64 0 %splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer store <8 x double> %splat, <8 x double>* %a diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll @@ -1,6 +1,7 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK -; RUN: llc -aarch64-sve-vector-bits-min=384 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK +; RUN: llc -aarch64-sve-vector-bits-min=256 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_384 ; RUN: llc -aarch64-sve-vector-bits-min=512 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck 
%s -check-prefixes=CHECK,VBITS_GE_512 @@ -30,10 +31,17 @@ ; NO_SVE-NOT: ptrue define void @subvector_v8i16(<8 x i16> *%in, <8 x i16>* %out) #0 { +; NO_SVE-LABEL: subvector_v8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: subvector_v8i16: -; CHECK: ldr [[DATA:q[0-9]+]], [x0] -; CHECK: str [[DATA]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret %a = load <8 x i16>, <8 x i16>* %in br label %bb1 @@ -43,11 +51,18 @@ } define void @subvector_v16i16(<16 x i16> *%in, <16 x i16>* %out) #0 { +; NO_SVE-LABEL: subvector_v16i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: subvector_v16i16: -; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; CHECK: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret %a = load <16 x i16>, <16 x i16>* %in br label %bb1 @@ -57,11 +72,40 @@ } define void @subvector_v32i16(<32 x i16> *%in, <32 x i16>* %out) #0 { -; CHECK-LABEL: subvector_v32i16: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v32i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: ldp q3, q2, [x0] +; NO_SVE-NEXT: stp q1, q0, [x1, #32] +; NO_SVE-NEXT: stp q3, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v32i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <32 x i16>, <32 x i16>* %in br label %bb1 @@ -71,11 +115,56 @@ } define void @subvector_v64i16(<64 x i16> *%in, <64 x i16>* %out) #0 { -; CHECK-LABEL: subvector_v64i16: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v64i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: ldp q5, q4, [x0, #32] +; NO_SVE-NEXT: ldp q7, q6, [x0] +; NO_SVE-NEXT: stp q5, q4, [x1, #32] +; NO_SVE-NEXT: stp q3, q2, [x1, #64] +; NO_SVE-NEXT: stp q1, q0, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #32 +; VBITS_GE_256-NEXT: mov 
x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v64i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #48 +; VBITS_GE_384-NEXT: mov x9, #32 +; VBITS_GE_384-NEXT: mov x10, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v64i16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <64 x i16>, <64 x i16>* %in br label %bb1 @@ -85,11 +174,18 @@ } define void @subvector_v8i32(<8 x i32> *%in, <8 x i32>* %out) #0 { +; NO_SVE-LABEL: subvector_v8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: subvector_v8i32: -; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; CHECK: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret %a = load <8 x i32>, <8 x i32>* %in br label %bb1 @@ -99,11 +195,40 @@ } define void @subvector_v16i32(<16 x i32> *%in, <16 x i32>* %out) #0 { -; CHECK-LABEL: subvector_v16i32: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: ldp q3, q2, [x0] +; NO_SVE-NEXT: stp q1, q0, [x1, #32] +; NO_SVE-NEXT: stp q3, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v16i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; 
VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, <16 x i32>* %in br label %bb1 @@ -113,11 +238,56 @@ } define void @subvector_v32i32(<32 x i32> *%in, <32 x i32>* %out) #0 { -; CHECK-LABEL: subvector_v32i32: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v32i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: ldp q5, q4, [x0, #32] +; NO_SVE-NEXT: ldp q7, q6, [x0] +; NO_SVE-NEXT: stp q5, q4, [x1, #32] +; NO_SVE-NEXT: stp q3, q2, [x1, #64] +; NO_SVE-NEXT: stp q1, q0, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v32i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #24 +; VBITS_GE_384-NEXT: mov x9, #16 +; VBITS_GE_384-NEXT: mov x10, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v32i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <32 x i32>, <32 x i32>* %in br label %bb1 @@ -127,11 +297,88 @@ } define void @subvector_v64i32(<64 x i32> *%in, <64 x i32>* %out) #0 { -; CHECK-LABEL: subvector_v64i32: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_2048: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v64i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #224] +; NO_SVE-NEXT: ldp q3, q2, [x0, #192] +; NO_SVE-NEXT: ldp q5, q4, [x0, #160] +; NO_SVE-NEXT: ldp q7, q6, [x0, #128] +; NO_SVE-NEXT: ldp q17, q16, [x0, #96] +; NO_SVE-NEXT: ldp q19, q18, [x0, #64] +; NO_SVE-NEXT: ldp q21, q20, [x0, #32] +; NO_SVE-NEXT: ldp q23, q22, [x0] +; NO_SVE-NEXT: stp q21, q20, [x1, #32] +; NO_SVE-NEXT: stp q19, q18, [x1, #64] +; NO_SVE-NEXT: stp q17, q16, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #128] +; NO_SVE-NEXT: stp q5, q4, [x1, #160] +; NO_SVE-NEXT: stp q23, q22, [x1] +; NO_SVE-NEXT: stp q3, q2, [x1, #192] +; NO_SVE-NEXT: stp q1, q0, [x1, #224] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, 
#56 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: mov x13, #16 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v64i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #56 +; VBITS_GE_384-NEXT: mov x9, #48 +; VBITS_GE_384-NEXT: mov x10, #40 +; VBITS_GE_384-NEXT: mov x11, #32 +; VBITS_GE_384-NEXT: mov x12, #24 +; VBITS_GE_384-NEXT: mov x13, #16 +; VBITS_GE_384-NEXT: mov x14, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2] +; VBITS_GE_384-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_384-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2] +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_2048-LABEL: subvector_v64i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %a = load <64 x i32>, <64 x i32>* %in br label %bb1 @@ -142,11 +389,40 @@ define void @subvector_v8i64(<8 x i64> *%in, <8 x i64>* %out) #0 { -; CHECK-LABEL: subvector_v8i64: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: ldp q3, q2, [x0] +; NO_SVE-NEXT: stp q1, q0, [x1, #32] +; NO_SVE-NEXT: stp q3, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, 
[x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v8i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %in br label %bb1 @@ -156,11 +432,56 @@ } define void @subvector_v16i64(<16 x i64> *%in, <16 x i64>* %out) #0 { -; CHECK-LABEL: subvector_v16i64: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: ldp q5, q4, [x0, #32] +; NO_SVE-NEXT: ldp q7, q6, [x0] +; NO_SVE-NEXT: stp q5, q4, [x1, #32] +; NO_SVE-NEXT: stp q3, q2, [x1, #64] +; NO_SVE-NEXT: stp q1, q0, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v16i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #12 +; VBITS_GE_384-NEXT: mov x9, #8 +; VBITS_GE_384-NEXT: mov x10, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v16i64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <16 x i64>, <16 x i64>* %in br label %bb1 @@ -170,11 +491,88 @@ } define void @subvector_v32i64(<32 x i64> *%in, <32 x i64>* %out) #0 { -; CHECK-LABEL: subvector_v32i64: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_2048: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #224] +; NO_SVE-NEXT: ldp q3, q2, [x0, 
#192] +; NO_SVE-NEXT: ldp q5, q4, [x0, #160] +; NO_SVE-NEXT: ldp q7, q6, [x0, #128] +; NO_SVE-NEXT: ldp q17, q16, [x0, #96] +; NO_SVE-NEXT: ldp q19, q18, [x0, #64] +; NO_SVE-NEXT: ldp q21, q20, [x0, #32] +; NO_SVE-NEXT: ldp q23, q22, [x0] +; NO_SVE-NEXT: stp q21, q20, [x1, #32] +; NO_SVE-NEXT: stp q19, q18, [x1, #64] +; NO_SVE-NEXT: stp q17, q16, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #128] +; NO_SVE-NEXT: stp q5, q4, [x1, #160] +; NO_SVE-NEXT: stp q23, q22, [x1] +; NO_SVE-NEXT: stp q3, q2, [x1, #192] +; NO_SVE-NEXT: stp q1, q0, [x1, #224] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #28 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #20 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #12 +; VBITS_GE_256-NEXT: mov x13, #8 +; VBITS_GE_256-NEXT: mov x14, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v32i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #28 +; VBITS_GE_384-NEXT: mov x9, #24 +; VBITS_GE_384-NEXT: mov x10, #20 +; VBITS_GE_384-NEXT: mov x11, #16 +; VBITS_GE_384-NEXT: mov x12, #12 +; VBITS_GE_384-NEXT: mov x13, #8 +; VBITS_GE_384-NEXT: mov x14, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3] +; VBITS_GE_384-NEXT: st1d { z5.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_384-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3] +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_2048-LABEL: subvector_v32i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i64>, <32 x i64>* %in br label %bb1 @@ -184,10 +582,17 @@ } define void @subvector_v8f16(<8 x half> *%in, <8 x half>* 
%out) #0 { +; NO_SVE-LABEL: subvector_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0] +; NO_SVE-NEXT: str q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: subvector_v8f16: -; CHECK: ldr [[DATA:q[0-9]+]], [x0] -; CHECK: str [[DATA]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret %a = load <8 x half>, <8 x half>* %in br label %bb1 @@ -197,11 +602,18 @@ } define void @subvector_v16f16(<16 x half> *%in, <16 x half>* %out) #0 { +; NO_SVE-LABEL: subvector_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: subvector_v16f16: -; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; CHECK: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret %a = load <16 x half>, <16 x half>* %in br label %bb1 @@ -211,11 +623,40 @@ } define void @subvector_v32f16(<32 x half> *%in, <32 x half>* %out) #0 { -; CHECK-LABEL: subvector_v32f16: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: ldp q3, q2, [x0] +; NO_SVE-NEXT: stp q1, q0, [x1, #32] +; NO_SVE-NEXT: stp q3, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v32f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v32f16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <32 x half>, <32 x half>* %in br label %bb1 @@ -225,11 +666,56 @@ } define void @subvector_v64f16(<64 x half> *%in, <64 x half>* %out) #0 { -; CHECK-LABEL: subvector_v64f16: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: ldp q5, q4, [x0, #32] +; NO_SVE-NEXT: ldp q7, q6, [x0] +; NO_SVE-NEXT: stp q5, q4, [x1, #32] +; NO_SVE-NEXT: stp q3, q2, [x1, #64] +; NO_SVE-NEXT: stp q1, q0, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #32 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; 
VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v64f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #48 +; VBITS_GE_384-NEXT: mov x9, #32 +; VBITS_GE_384-NEXT: mov x10, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v64f16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <64 x half>, <64 x half>* %in br label %bb1 @@ -239,11 +725,18 @@ } define void @subvector_v8f32(<8 x float> *%in, <8 x float>* %out) #0 { +; NO_SVE-LABEL: subvector_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: subvector_v8f32: -; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; CHECK: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret %a = load <8 x float>, <8 x float>* %in br label %bb1 @@ -253,11 +746,40 @@ } define void @subvector_v16f32(<16 x float> *%in, <16 x float>* %out) #0 { -; CHECK-LABEL: subvector_v16f32: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: ldp q3, q2, [x0] +; NO_SVE-NEXT: stp q1, q0, [x1, #32] +; NO_SVE-NEXT: stp q3, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v16f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v16f32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <16 x float>, <16 x 
float>* %in br label %bb1 @@ -267,11 +789,56 @@ } define void @subvector_v32f32(<32 x float> *%in, <32 x float>* %out) #0 { -; CHECK-LABEL: subvector_v32f32: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: ldp q5, q4, [x0, #32] +; NO_SVE-NEXT: ldp q7, q6, [x0] +; NO_SVE-NEXT: stp q5, q4, [x1, #32] +; NO_SVE-NEXT: stp q3, q2, [x1, #64] +; NO_SVE-NEXT: stp q1, q0, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v32f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #24 +; VBITS_GE_384-NEXT: mov x9, #16 +; VBITS_GE_384-NEXT: mov x10, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v32f32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <32 x float>, <32 x float>* %in br label %bb1 @@ -281,11 +848,88 @@ } define void @subvector_v64f32(<64 x float> *%in, <64 x float>* %out) #0 { -; CHECK-LABEL: subvector_v64f32: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_2048: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #224] +; NO_SVE-NEXT: ldp q3, q2, [x0, #192] +; NO_SVE-NEXT: ldp q5, q4, [x0, #160] +; NO_SVE-NEXT: ldp q7, q6, [x0, #128] +; NO_SVE-NEXT: ldp q17, q16, [x0, #96] +; NO_SVE-NEXT: ldp q19, q18, [x0, #64] +; NO_SVE-NEXT: ldp q21, q20, [x0, #32] +; NO_SVE-NEXT: ldp q23, q22, [x0] +; NO_SVE-NEXT: stp q21, q20, [x1, #32] +; NO_SVE-NEXT: stp q19, q18, [x1, #64] +; NO_SVE-NEXT: stp q17, q16, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #128] +; NO_SVE-NEXT: stp q5, q4, [x1, #160] +; NO_SVE-NEXT: stp q23, q22, [x1] +; NO_SVE-NEXT: stp q3, q2, [x1, #192] +; NO_SVE-NEXT: stp q1, q0, [x1, #224] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #56 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #40 +; 
VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: mov x13, #16 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v64f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #56 +; VBITS_GE_384-NEXT: mov x9, #48 +; VBITS_GE_384-NEXT: mov x10, #40 +; VBITS_GE_384-NEXT: mov x11, #32 +; VBITS_GE_384-NEXT: mov x12, #24 +; VBITS_GE_384-NEXT: mov x13, #16 +; VBITS_GE_384-NEXT: mov x14, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2] +; VBITS_GE_384-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_384-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2] +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_2048-LABEL: subvector_v64f32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %a = load <64 x float>, <64 x float>* %in br label %bb1 @@ -294,11 +938,40 @@ ret void } define void @subvector_v8f64(<8 x double> *%in, <8 x double>* %out) #0 { -; CHECK-LABEL: subvector_v8f64: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #32] +; NO_SVE-NEXT: ldp q3, q2, [x0] +; NO_SVE-NEXT: stp q1, q0, [x1, #32] +; NO_SVE-NEXT: stp q3, q2, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, 
lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v8f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v8f64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <8 x double>, <8 x double>* %in br label %bb1 @@ -308,11 +981,56 @@ } define void @subvector_v16f64(<16 x double> *%in, <16 x double>* %out) #0 { -; CHECK-LABEL: subvector_v16f64: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #96] +; NO_SVE-NEXT: ldp q3, q2, [x0, #64] +; NO_SVE-NEXT: ldp q5, q4, [x0, #32] +; NO_SVE-NEXT: ldp q7, q6, [x0] +; NO_SVE-NEXT: stp q5, q4, [x1, #32] +; NO_SVE-NEXT: stp q3, q2, [x1, #64] +; NO_SVE-NEXT: stp q1, q0, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v16f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #12 +; VBITS_GE_384-NEXT: mov x9, #8 +; VBITS_GE_384-NEXT: mov x10, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v16f64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <16 x double>, <16 x double>* %in br label %bb1 @@ -322,11 +1040,88 @@ } define void @subvector_v32f64(<32 x double> *%in, <32 x double>* %out) #0 { -; CHECK-LABEL: subvector_v32f64: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_2048: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; NO_SVE-LABEL: subvector_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0, #224] +; NO_SVE-NEXT: ldp q3, q2, [x0, #192] +; NO_SVE-NEXT: ldp q5, 
q4, [x0, #160] +; NO_SVE-NEXT: ldp q7, q6, [x0, #128] +; NO_SVE-NEXT: ldp q17, q16, [x0, #96] +; NO_SVE-NEXT: ldp q19, q18, [x0, #64] +; NO_SVE-NEXT: ldp q21, q20, [x0, #32] +; NO_SVE-NEXT: ldp q23, q22, [x0] +; NO_SVE-NEXT: stp q21, q20, [x1, #32] +; NO_SVE-NEXT: stp q19, q18, [x1, #64] +; NO_SVE-NEXT: stp q17, q16, [x1, #96] +; NO_SVE-NEXT: stp q7, q6, [x1, #128] +; NO_SVE-NEXT: stp q5, q4, [x1, #160] +; NO_SVE-NEXT: stp q23, q22, [x1] +; NO_SVE-NEXT: stp q3, q2, [x1, #192] +; NO_SVE-NEXT: stp q1, q0, [x1, #224] +; NO_SVE-NEXT: ret +; +; VBITS_GE_256-LABEL: subvector_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #28 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #20 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #12 +; VBITS_GE_256-NEXT: mov x13, #8 +; VBITS_GE_256-NEXT: mov x14, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_384-LABEL: subvector_v32f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #28 +; VBITS_GE_384-NEXT: mov x9, #24 +; VBITS_GE_384-NEXT: mov x10, #20 +; VBITS_GE_384-NEXT: mov x11, #16 +; VBITS_GE_384-NEXT: mov x12, #12 +; VBITS_GE_384-NEXT: mov x13, #8 +; VBITS_GE_384-NEXT: mov x14, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x0] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3] +; VBITS_GE_384-NEXT: st1d { z5.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_384-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3] +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_GE_384-NEXT: ret +; +; VBITS_GE_2048-LABEL: subvector_v32f64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %a = load <32 x double>, <32 x double>* %in br label %bb1 @@ -336,11 +1131,25 @@ } define <8 x i1> @no_warn_dropped_scalable(<8 x i32>* %in) #0 { +; NO_SVE-LABEL: 
no_warn_dropped_scalable: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: cmgt v1.4s, v1.4s, #0 +; NO_SVE-NEXT: cmgt v0.4s, v0.4s, #0 +; NO_SVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NO_SVE-NEXT: xtn v0.8b, v0.8h +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: no_warn_dropped_scalable: -; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK: ld1w { [[A:z[0-9]+]].s }, [[PG]]/z, [x0] -; CHECK: cmpgt p{{[0-9]}}.s, [[PG]]/z, [[A]].s, #0 -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: cmpgt p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret %a = load <8 x i32>, <8 x i32>* %in br label %bb1 @@ -354,16 +1163,28 @@ ; performed when the input idiom is the result of operation legalisation. When ; not prevented the test triggers infinite combine->legalise->combine->... define void @no_subvector_binop_hang(<8 x i32>* %in, <8 x i32>* %out, i1 %cond) #0 { +; NO_SVE-LABEL: no_subvector_binop_hang: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: tbz w2, #0, .LBB23_2 +; NO_SVE-NEXT: // %bb.1: // %bb.1 +; NO_SVE-NEXT: ldp q1, q0, [x0] +; NO_SVE-NEXT: ldp q2, q3, [x1] +; NO_SVE-NEXT: orr v1.16b, v1.16b, v2.16b +; NO_SVE-NEXT: orr v0.16b, v0.16b, v3.16b +; NO_SVE-NEXT: stp q1, q0, [x1] +; NO_SVE-NEXT: .LBB23_2: // %bb.2 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: no_subvector_binop_hang: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-NEXT: ld1w { [[A:z[0-9]+]].s }, [[PG]]/z, [x0] -; CHECK-NEXT: ld1w { [[B:z[0-9]+]].s }, [[PG]]/z, [x1] -; CHECK-NEXT: tbz w2, #0, [[LABEL:\.[A-z0-9_]+]] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: tbz w2, #0, .LBB23_2 ; CHECK-NEXT: // %bb.1: // %bb.1 -; CHECK-NEXT: orr [[OR:z[0-9]+]].d, [[A]].d, [[B]].d -; CHECK-NEXT: st1w { [[OR]].s }, [[PG]], [x1] -; CHECK-NEXT: [[LABEL]]: // %bb.2 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: .LBB23_2: // %bb.2 ; CHECK-NEXT: ret %a = load <8 x i32>, <8 x i32>* %in %b = load <8 x i32>, <8 x i32>* %out diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll @@ -1,6 +1,7 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_384 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -22,6 +23,11 @@ ; Don't use SVE for 64-bit vectors define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v8i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.8b, v0.8b, v1.8b, #7 +; NO_SVE-NEXT: ret +; ; 
CHECK-LABEL: shuffle_ext_byone_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #7 @@ -32,6 +38,11 @@ ; Don't use SVE for 128-bit vectors define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) { +; NO_SVE-LABEL: shuffle_ext_byone_v16i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #15 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #15 @@ -42,6 +53,15 @@ } define void @shuffle_ext_byone_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v32i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q2, [x1] +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #15 +; NO_SVE-NEXT: ext v1.16b, v1.16b, v2.16b, #15 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl32 @@ -64,6 +84,19 @@ define void @shuffle_ext_byone_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; Ensure sensible type legalisation. +; NO_SVE-LABEL: shuffle_ext_byone_v64i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #32] +; NO_SVE-NEXT: ldp q4, q2, [x1] +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #15 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: ext v5.16b, v2.16b, v1.16b, #15 +; NO_SVE-NEXT: ext v2.16b, v4.16b, v2.16b, #15 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v4.16b, #15 +; NO_SVE-NEXT: stp q5, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q2, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: shuffle_ext_byone_v64i8: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov w8, #32 @@ -81,6 +114,23 @@ ; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x0, x8] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v64i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov w8, #32 +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_384-NEXT: ld1b { z1.b }, p0/z, [x1, x8] +; VBITS_GE_384-NEXT: ld1b { z2.b }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z0.b, z0.b[31] +; VBITS_GE_384-NEXT: mov z3.b, z2.b[31] +; VBITS_GE_384-NEXT: fmov w9, s0 +; VBITS_GE_384-NEXT: fmov w10, s3 +; VBITS_GE_384-NEXT: insr z2.b, w9 +; VBITS_GE_384-NEXT: insr z1.b, w10 +; VBITS_GE_384-NEXT: st1b { z2.b }, p0, [x0] +; VBITS_GE_384-NEXT: st1b { z1.b }, p0, [x0, x8] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v64i8: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 @@ -106,6 +156,85 @@ } define void @shuffle_ext_byone_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v128i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q2, q1, [x1, #96] +; NO_SVE-NEXT: ldp q6, q5, [x1, #64] +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #15 +; NO_SVE-NEXT: ldp q4, q3, [x1, #32] +; NO_SVE-NEXT: ext v2.16b, v5.16b, v2.16b, #15 +; NO_SVE-NEXT: ldp q16, q7, [x1] +; NO_SVE-NEXT: ext v17.16b, v4.16b, v3.16b, #15 +; NO_SVE-NEXT: ldr q0, [x0, #112] +; NO_SVE-NEXT: stp q2, q1, [x0, #96] +; NO_SVE-NEXT: ext v4.16b, v7.16b, v4.16b, #15 +; NO_SVE-NEXT: ext v7.16b, v16.16b, v7.16b, #15 +; NO_SVE-NEXT: ext v0.16b, v0.16b, v16.16b, #15 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v6.16b, #15 +; NO_SVE-NEXT: ext v2.16b, v6.16b, v5.16b, #15 +; NO_SVE-NEXT: stp q4, q17, [x0, #32] +; NO_SVE-NEXT: stp q0, q7, [x0] +; NO_SVE-NEXT: stp q1, q2, [x0, #64] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v128i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov w8, #64 +; VBITS_EQ_256-NEXT: mov w10, #32 +; VBITS_EQ_256-NEXT: ptrue 
p0.b, vl32 +; VBITS_EQ_256-NEXT: mov w9, #96 +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x1, x8] +; VBITS_EQ_256-NEXT: ld1b { z2.b }, p0/z, [x1, x10] +; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_EQ_256-NEXT: ld1b { z3.b }, p0/z, [x1, x9] +; VBITS_EQ_256-NEXT: ld1b { z4.b }, p0/z, [x1] +; VBITS_EQ_256-NEXT: mov z5.b, z0.b[31] +; VBITS_EQ_256-NEXT: fmov w11, s5 +; VBITS_EQ_256-NEXT: mov z5.b, z2.b[31] +; VBITS_EQ_256-NEXT: mov z1.b, z1.b[31] +; VBITS_EQ_256-NEXT: fmov w12, s5 +; VBITS_EQ_256-NEXT: mov z5.b, z4.b[31] +; VBITS_EQ_256-NEXT: fmov w13, s1 +; VBITS_EQ_256-NEXT: fmov w14, s5 +; VBITS_EQ_256-NEXT: insr z3.b, w11 +; VBITS_EQ_256-NEXT: insr z0.b, w12 +; VBITS_EQ_256-NEXT: st1b { z3.b }, p0, [x0, x9] +; VBITS_EQ_256-NEXT: insr z4.b, w13 +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_EQ_256-NEXT: insr z2.b, w14 +; VBITS_EQ_256-NEXT: st1b { z4.b }, p0, [x0] +; VBITS_EQ_256-NEXT: st1b { z2.b }, p0, [x0, x10] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v128i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov w8, #64 +; VBITS_GE_384-NEXT: mov w10, #32 +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: mov w9, #96 +; VBITS_GE_384-NEXT: ld1b { z0.b }, p0/z, [x1, x8] +; VBITS_GE_384-NEXT: ld1b { z2.b }, p0/z, [x1, x10] +; VBITS_GE_384-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_384-NEXT: ld1b { z3.b }, p0/z, [x1, x9] +; VBITS_GE_384-NEXT: ld1b { z4.b }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z5.b, z0.b[31] +; VBITS_GE_384-NEXT: fmov w11, s5 +; VBITS_GE_384-NEXT: mov z5.b, z2.b[31] +; VBITS_GE_384-NEXT: mov z1.b, z1.b[31] +; VBITS_GE_384-NEXT: fmov w12, s5 +; VBITS_GE_384-NEXT: mov z5.b, z4.b[31] +; VBITS_GE_384-NEXT: fmov w13, s1 +; VBITS_GE_384-NEXT: fmov w14, s5 +; VBITS_GE_384-NEXT: insr z3.b, w11 +; VBITS_GE_384-NEXT: insr z0.b, w12 +; VBITS_GE_384-NEXT: st1b { z3.b }, p0, [x0, x9] +; VBITS_GE_384-NEXT: insr z4.b, w13 +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_384-NEXT: insr z2.b, w14 +; VBITS_GE_384-NEXT: st1b { z4.b }, p0, [x0] +; VBITS_GE_384-NEXT: st1b { z2.b }, p0, [x0, x10] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -140,6 +269,149 @@ } define void @shuffle_ext_byone_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v256i8: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #160] +; NO_SVE-NEXT: ldp q3, q2, [x1, #128] +; NO_SVE-NEXT: ldp q6, q7, [x1, #192] +; NO_SVE-NEXT: ext v19.16b, v3.16b, v2.16b, #15 +; NO_SVE-NEXT: ldp q18, q16, [x1, #96] +; NO_SVE-NEXT: ext v21.16b, v6.16b, v7.16b, #15 +; NO_SVE-NEXT: ext v6.16b, v0.16b, v6.16b, #15 +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #15 +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #15 +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: ext v3.16b, v16.16b, v3.16b, #15 +; NO_SVE-NEXT: ldp q22, q20, [x1, #32] +; NO_SVE-NEXT: ext v17.16b, v5.16b, v4.16b, #15 +; NO_SVE-NEXT: ldp q24, q23, [x1] +; NO_SVE-NEXT: ldp q25, q2, [x1, #224] +; NO_SVE-NEXT: stp q3, q19, [x0, #128] +; NO_SVE-NEXT: stp q1, q0, [x0, #160] +; NO_SVE-NEXT: stp q6, q21, [x0, #192] +; NO_SVE-NEXT: ext v6.16b, v18.16b, v16.16b, #15 +; NO_SVE-NEXT: ext v0.16b, v4.16b, v18.16b, #15 +; NO_SVE-NEXT: ext v4.16b, v20.16b, v5.16b, #15 +; NO_SVE-NEXT: ldr q1, [x0, #240] +; NO_SVE-NEXT: ext v3.16b, v22.16b, v20.16b, #15 +; NO_SVE-NEXT: ext v5.16b, v23.16b, v22.16b, #15 +; NO_SVE-NEXT: stp q0, q6, [x0, #96] +; NO_SVE-NEXT: ext v0.16b, v24.16b, v23.16b, #15 +; 
NO_SVE-NEXT: stp q4, q17, [x0, #64] +; NO_SVE-NEXT: ext v1.16b, v1.16b, v24.16b, #15 +; NO_SVE-NEXT: ext v2.16b, v25.16b, v2.16b, #15 +; NO_SVE-NEXT: stp q5, q3, [x0, #32] +; NO_SVE-NEXT: ext v3.16b, v7.16b, v25.16b, #15 +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: stp q3, q2, [x0, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v256i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov w8, #32 +; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 +; VBITS_EQ_256-NEXT: mov w11, #128 +; VBITS_EQ_256-NEXT: mov w13, #64 +; VBITS_EQ_256-NEXT: mov w12, #96 +; VBITS_EQ_256-NEXT: mov w14, #160 +; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x1, x8] +; VBITS_EQ_256-NEXT: mov w10, #192 +; VBITS_EQ_256-NEXT: ld1b { z3.b }, p0/z, [x1, x11] +; VBITS_EQ_256-NEXT: ld1b { z5.b }, p0/z, [x1, x13] +; VBITS_EQ_256-NEXT: mov w9, #224 +; VBITS_EQ_256-NEXT: ld1b { z7.b }, p0/z, [x1, x12] +; VBITS_EQ_256-NEXT: ld1b { z4.b }, p0/z, [x1, x10] +; VBITS_EQ_256-NEXT: mov z6.b, z0.b[31] +; VBITS_EQ_256-NEXT: fmov w15, s6 +; VBITS_EQ_256-NEXT: ld1b { z6.b }, p0/z, [x1, x14] +; VBITS_EQ_256-NEXT: mov z16.b, z3.b[31] +; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_EQ_256-NEXT: ld1b { z2.b }, p0/z, [x1, x9] +; VBITS_EQ_256-NEXT: ld1b { z17.b }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fmov w16, s16 +; VBITS_EQ_256-NEXT: mov z16.b, z5.b[31] +; VBITS_EQ_256-NEXT: insr z5.b, w15 +; VBITS_EQ_256-NEXT: fmov w15, s16 +; VBITS_EQ_256-NEXT: mov z16.b, z7.b[31] +; VBITS_EQ_256-NEXT: mov z1.b, z1.b[31] +; VBITS_EQ_256-NEXT: fmov w17, s16 +; VBITS_EQ_256-NEXT: mov z16.b, z6.b[31] +; VBITS_EQ_256-NEXT: fmov w18, s16 +; VBITS_EQ_256-NEXT: mov z16.b, z4.b[31] +; VBITS_EQ_256-NEXT: insr z7.b, w15 +; VBITS_EQ_256-NEXT: fmov w15, s16 +; VBITS_EQ_256-NEXT: mov z16.b, z17.b[31] +; VBITS_EQ_256-NEXT: fmov w1, s1 +; VBITS_EQ_256-NEXT: fmov w2, s16 +; VBITS_EQ_256-NEXT: insr z3.b, w17 +; VBITS_EQ_256-NEXT: insr z6.b, w16 +; VBITS_EQ_256-NEXT: insr z4.b, w18 +; VBITS_EQ_256-NEXT: insr z2.b, w15 +; VBITS_EQ_256-NEXT: insr z17.b, w1 +; VBITS_EQ_256-NEXT: insr z0.b, w2 +; VBITS_EQ_256-NEXT: st1b { z2.b }, p0, [x0, x9] +; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_EQ_256-NEXT: st1b { z4.b }, p0, [x0, x10] +; VBITS_EQ_256-NEXT: st1b { z3.b }, p0, [x0, x11] +; VBITS_EQ_256-NEXT: st1b { z7.b }, p0, [x0, x12] +; VBITS_EQ_256-NEXT: st1b { z5.b }, p0, [x0, x13] +; VBITS_EQ_256-NEXT: st1b { z6.b }, p0, [x0, x14] +; VBITS_EQ_256-NEXT: st1b { z17.b }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v256i8: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov w8, #32 +; VBITS_GE_384-NEXT: ptrue p0.b, vl32 +; VBITS_GE_384-NEXT: mov w11, #128 +; VBITS_GE_384-NEXT: mov w13, #64 +; VBITS_GE_384-NEXT: mov w12, #96 +; VBITS_GE_384-NEXT: mov w14, #160 +; VBITS_GE_384-NEXT: ld1b { z0.b }, p0/z, [x1, x8] +; VBITS_GE_384-NEXT: mov w10, #192 +; VBITS_GE_384-NEXT: ld1b { z3.b }, p0/z, [x1, x11] +; VBITS_GE_384-NEXT: ld1b { z5.b }, p0/z, [x1, x13] +; VBITS_GE_384-NEXT: mov w9, #224 +; VBITS_GE_384-NEXT: ld1b { z7.b }, p0/z, [x1, x12] +; VBITS_GE_384-NEXT: ld1b { z4.b }, p0/z, [x1, x10] +; VBITS_GE_384-NEXT: mov z6.b, z0.b[31] +; VBITS_GE_384-NEXT: fmov w15, s6 +; VBITS_GE_384-NEXT: ld1b { z6.b }, p0/z, [x1, x14] +; VBITS_GE_384-NEXT: mov z16.b, z3.b[31] +; VBITS_GE_384-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_384-NEXT: ld1b { z2.b }, p0/z, [x1, x9] +; VBITS_GE_384-NEXT: ld1b { z17.b }, p0/z, [x1] +; VBITS_GE_384-NEXT: fmov w16, s16 +; VBITS_GE_384-NEXT: mov z16.b, z5.b[31] +; VBITS_GE_384-NEXT: insr z5.b, 
w15 +; VBITS_GE_384-NEXT: fmov w15, s16 +; VBITS_GE_384-NEXT: mov z16.b, z7.b[31] +; VBITS_GE_384-NEXT: mov z1.b, z1.b[31] +; VBITS_GE_384-NEXT: fmov w17, s16 +; VBITS_GE_384-NEXT: mov z16.b, z6.b[31] +; VBITS_GE_384-NEXT: fmov w18, s16 +; VBITS_GE_384-NEXT: mov z16.b, z4.b[31] +; VBITS_GE_384-NEXT: insr z7.b, w15 +; VBITS_GE_384-NEXT: fmov w15, s16 +; VBITS_GE_384-NEXT: mov z16.b, z17.b[31] +; VBITS_GE_384-NEXT: fmov w1, s1 +; VBITS_GE_384-NEXT: fmov w2, s16 +; VBITS_GE_384-NEXT: insr z3.b, w17 +; VBITS_GE_384-NEXT: insr z6.b, w16 +; VBITS_GE_384-NEXT: insr z4.b, w18 +; VBITS_GE_384-NEXT: insr z2.b, w15 +; VBITS_GE_384-NEXT: insr z17.b, w1 +; VBITS_GE_384-NEXT: insr z0.b, w2 +; VBITS_GE_384-NEXT: st1b { z2.b }, p0, [x0, x9] +; VBITS_GE_384-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_384-NEXT: st1b { z4.b }, p0, [x0, x10] +; VBITS_GE_384-NEXT: st1b { z3.b }, p0, [x0, x11] +; VBITS_GE_384-NEXT: st1b { z7.b }, p0, [x0, x12] +; VBITS_GE_384-NEXT: st1b { z5.b }, p0, [x0, x13] +; VBITS_GE_384-NEXT: st1b { z6.b }, p0, [x0, x14] +; VBITS_GE_384-NEXT: st1b { z17.b }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 @@ -191,6 +463,11 @@ ; Don't use SVE for 64-bit vectors define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v4i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.8b, v0.8b, v1.8b, #6 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #6 @@ -201,6 +478,11 @@ ; Don't use SVE for 128-bit vectors define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v8i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #14 @@ -210,6 +492,15 @@ } define void @shuffle_ext_byone_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v16i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q2, [x1] +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NO_SVE-NEXT: ext v1.16b, v1.16b, v2.16b, #14 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -230,6 +521,19 @@ define void @shuffle_ext_byone_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; Ensure sensible type legalisation. 
+; NO_SVE-LABEL: shuffle_ext_byone_v32i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #32] +; NO_SVE-NEXT: ldp q4, q2, [x1] +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #14 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: ext v5.16b, v2.16b, v1.16b, #14 +; NO_SVE-NEXT: ext v2.16b, v4.16b, v2.16b, #14 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v4.16b, #14 +; NO_SVE-NEXT: stp q5, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q2, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: shuffle_ext_byone_v32i16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #16 @@ -247,6 +551,23 @@ ; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v32i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z0.h, z0.h[15] +; VBITS_GE_384-NEXT: mov z3.h, z2.h[15] +; VBITS_GE_384-NEXT: fmov w9, s0 +; VBITS_GE_384-NEXT: fmov w10, s3 +; VBITS_GE_384-NEXT: insr z2.h, w9 +; VBITS_GE_384-NEXT: insr z1.h, w10 +; VBITS_GE_384-NEXT: st1h { z2.h }, p0, [x0] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v32i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -268,6 +589,85 @@ } define void @shuffle_ext_byone_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v64i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q2, q1, [x1, #96] +; NO_SVE-NEXT: ldp q6, q5, [x1, #64] +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #14 +; NO_SVE-NEXT: ldp q4, q3, [x1, #32] +; NO_SVE-NEXT: ext v2.16b, v5.16b, v2.16b, #14 +; NO_SVE-NEXT: ldp q16, q7, [x1] +; NO_SVE-NEXT: ext v17.16b, v4.16b, v3.16b, #14 +; NO_SVE-NEXT: ldr q0, [x0, #112] +; NO_SVE-NEXT: stp q2, q1, [x0, #96] +; NO_SVE-NEXT: ext v4.16b, v7.16b, v4.16b, #14 +; NO_SVE-NEXT: ext v7.16b, v16.16b, v7.16b, #14 +; NO_SVE-NEXT: ext v0.16b, v0.16b, v16.16b, #14 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v6.16b, #14 +; NO_SVE-NEXT: ext v2.16b, v6.16b, v5.16b, #14 +; NO_SVE-NEXT: stp q4, q17, [x0, #32] +; NO_SVE-NEXT: stp q0, q7, [x0] +; NO_SVE-NEXT: stp q1, q2, [x0, #64] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v64i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x1] +; VBITS_EQ_256-NEXT: mov z5.h, z0.h[15] +; VBITS_EQ_256-NEXT: fmov w11, s5 +; VBITS_EQ_256-NEXT: mov z5.h, z2.h[15] +; VBITS_EQ_256-NEXT: mov z1.h, z1.h[15] +; VBITS_EQ_256-NEXT: fmov w12, s5 +; VBITS_EQ_256-NEXT: mov z5.h, z4.h[15] +; VBITS_EQ_256-NEXT: fmov w13, s1 +; VBITS_EQ_256-NEXT: fmov w14, s5 +; VBITS_EQ_256-NEXT: insr z3.h, w11 +; VBITS_EQ_256-NEXT: insr z0.h, w12 +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: insr z4.h, w13 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: insr z2.h, w14 +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; 
VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v64i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #32 +; VBITS_GE_384-NEXT: mov x10, #16 +; VBITS_GE_384-NEXT: mov x9, #48 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z4.h }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z5.h, z0.h[15] +; VBITS_GE_384-NEXT: fmov w11, s5 +; VBITS_GE_384-NEXT: mov z5.h, z2.h[15] +; VBITS_GE_384-NEXT: mov z1.h, z1.h[15] +; VBITS_GE_384-NEXT: fmov w12, s5 +; VBITS_GE_384-NEXT: mov z5.h, z4.h[15] +; VBITS_GE_384-NEXT: fmov w13, s1 +; VBITS_GE_384-NEXT: fmov w14, s5 +; VBITS_GE_384-NEXT: insr z3.h, w11 +; VBITS_GE_384-NEXT: insr z0.h, w12 +; VBITS_GE_384-NEXT: st1h { z3.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: insr z4.h, w13 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: insr z2.h, w14 +; VBITS_GE_384-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_GE_384-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -294,6 +694,149 @@ } define void @shuffle_ext_byone_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v128i16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #160] +; NO_SVE-NEXT: ldp q3, q2, [x1, #128] +; NO_SVE-NEXT: ldp q6, q7, [x1, #192] +; NO_SVE-NEXT: ext v19.16b, v3.16b, v2.16b, #14 +; NO_SVE-NEXT: ldp q18, q16, [x1, #96] +; NO_SVE-NEXT: ext v21.16b, v6.16b, v7.16b, #14 +; NO_SVE-NEXT: ext v6.16b, v0.16b, v6.16b, #14 +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #14 +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #14 +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: ext v3.16b, v16.16b, v3.16b, #14 +; NO_SVE-NEXT: ldp q22, q20, [x1, #32] +; NO_SVE-NEXT: ext v17.16b, v5.16b, v4.16b, #14 +; NO_SVE-NEXT: ldp q24, q23, [x1] +; NO_SVE-NEXT: ldp q25, q2, [x1, #224] +; NO_SVE-NEXT: stp q3, q19, [x0, #128] +; NO_SVE-NEXT: stp q1, q0, [x0, #160] +; NO_SVE-NEXT: stp q6, q21, [x0, #192] +; NO_SVE-NEXT: ext v6.16b, v18.16b, v16.16b, #14 +; NO_SVE-NEXT: ext v0.16b, v4.16b, v18.16b, #14 +; NO_SVE-NEXT: ext v4.16b, v20.16b, v5.16b, #14 +; NO_SVE-NEXT: ldr q1, [x0, #240] +; NO_SVE-NEXT: ext v3.16b, v22.16b, v20.16b, #14 +; NO_SVE-NEXT: ext v5.16b, v23.16b, v22.16b, #14 +; NO_SVE-NEXT: stp q0, q6, [x0, #96] +; NO_SVE-NEXT: ext v0.16b, v24.16b, v23.16b, #14 +; NO_SVE-NEXT: stp q4, q17, [x0, #64] +; NO_SVE-NEXT: ext v1.16b, v1.16b, v24.16b, #14 +; NO_SVE-NEXT: ext v2.16b, v25.16b, v2.16b, #14 +; NO_SVE-NEXT: stp q5, q3, [x0, #32] +; NO_SVE-NEXT: ext v3.16b, v7.16b, v25.16b, #14 +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: stp q3, q2, [x0, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v128i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x10, #64 +; VBITS_EQ_256-NEXT: mov x13, #32 +; VBITS_EQ_256-NEXT: mov x14, #48 +; VBITS_EQ_256-NEXT: mov x11, #80 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: mov x12, #96 +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z5.h }, p0/z, [x1, x13, lsl #1] +; VBITS_EQ_256-NEXT: mov x9, #112 +; VBITS_EQ_256-NEXT: ld1h { z7.h }, 
p0/z, [x1, x14, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x1, x12, lsl #1] +; VBITS_EQ_256-NEXT: mov z6.h, z0.h[15] +; VBITS_EQ_256-NEXT: fmov w15, s6 +; VBITS_EQ_256-NEXT: ld1h { z6.h }, p0/z, [x1, x11, lsl #1] +; VBITS_EQ_256-NEXT: mov z16.h, z2.h[15] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z17.h }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fmov w16, s16 +; VBITS_EQ_256-NEXT: mov z16.h, z5.h[15] +; VBITS_EQ_256-NEXT: insr z5.h, w15 +; VBITS_EQ_256-NEXT: fmov w15, s16 +; VBITS_EQ_256-NEXT: mov z16.h, z7.h[15] +; VBITS_EQ_256-NEXT: mov z1.h, z1.h[15] +; VBITS_EQ_256-NEXT: fmov w17, s16 +; VBITS_EQ_256-NEXT: mov z16.h, z6.h[15] +; VBITS_EQ_256-NEXT: fmov w18, s16 +; VBITS_EQ_256-NEXT: mov z16.h, z4.h[15] +; VBITS_EQ_256-NEXT: insr z7.h, w15 +; VBITS_EQ_256-NEXT: fmov w15, s16 +; VBITS_EQ_256-NEXT: mov z16.h, z17.h[15] +; VBITS_EQ_256-NEXT: fmov w1, s1 +; VBITS_EQ_256-NEXT: fmov w2, s16 +; VBITS_EQ_256-NEXT: insr z2.h, w17 +; VBITS_EQ_256-NEXT: insr z6.h, w16 +; VBITS_EQ_256-NEXT: insr z4.h, w18 +; VBITS_EQ_256-NEXT: insr z3.h, w15 +; VBITS_EQ_256-NEXT: insr z17.h, w1 +; VBITS_EQ_256-NEXT: insr z0.h, w2 +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z7.h }, p0, [x0, x14, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z6.h }, p0, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z17.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v128i16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: mov x10, #64 +; VBITS_GE_384-NEXT: mov x13, #32 +; VBITS_GE_384-NEXT: mov x14, #48 +; VBITS_GE_384-NEXT: mov x11, #80 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: mov x12, #96 +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z5.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_384-NEXT: mov x9, #112 +; VBITS_GE_384-NEXT: ld1h { z7.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z4.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_384-NEXT: mov z6.h, z0.h[15] +; VBITS_GE_384-NEXT: fmov w15, s6 +; VBITS_GE_384-NEXT: ld1h { z6.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_384-NEXT: mov z16.h, z2.h[15] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z17.h }, p0/z, [x1] +; VBITS_GE_384-NEXT: fmov w16, s16 +; VBITS_GE_384-NEXT: mov z16.h, z5.h[15] +; VBITS_GE_384-NEXT: insr z5.h, w15 +; VBITS_GE_384-NEXT: fmov w15, s16 +; VBITS_GE_384-NEXT: mov z16.h, z7.h[15] +; VBITS_GE_384-NEXT: mov z1.h, z1.h[15] +; VBITS_GE_384-NEXT: fmov w17, s16 +; VBITS_GE_384-NEXT: mov z16.h, z6.h[15] +; VBITS_GE_384-NEXT: fmov w18, s16 +; VBITS_GE_384-NEXT: mov z16.h, z4.h[15] +; VBITS_GE_384-NEXT: insr z7.h, w15 +; VBITS_GE_384-NEXT: fmov w15, s16 +; VBITS_GE_384-NEXT: mov z16.h, z17.h[15] +; VBITS_GE_384-NEXT: fmov w1, s1 +; VBITS_GE_384-NEXT: fmov w2, s16 +; VBITS_GE_384-NEXT: insr z2.h, w17 +; VBITS_GE_384-NEXT: insr z6.h, w16 +; VBITS_GE_384-NEXT: insr z4.h, w18 +; VBITS_GE_384-NEXT: insr z3.h, w15 +; VBITS_GE_384-NEXT: insr z17.h, w1 +; VBITS_GE_384-NEXT: insr z0.h, w2 +; VBITS_GE_384-NEXT: st1h { 
z3.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_384-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: st1h { z7.h }, p0, [x0, x14, lsl #1] +; VBITS_GE_384-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1] +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z6.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_384-NEXT: st1h { z17.h }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -329,6 +872,11 @@ ; Don't use SVE for 64-bit vectors define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v2i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4 @@ -339,6 +887,11 @@ ; Don't use SVE for 128-bit vectors define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v4i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12 @@ -348,6 +901,15 @@ } define void @shuffle_ext_byone_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v8i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q2, [x1] +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NO_SVE-NEXT: ext v1.16b, v1.16b, v2.16b, #12 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -367,6 +929,19 @@ define void @shuffle_ext_byone_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { ; Ensure sensible type legalisation. 
+; NO_SVE-LABEL: shuffle_ext_byone_v16i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #32] +; NO_SVE-NEXT: ldp q4, q2, [x1] +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #12 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: ext v5.16b, v2.16b, v1.16b, #12 +; NO_SVE-NEXT: ext v2.16b, v4.16b, v2.16b, #12 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v4.16b, #12 +; NO_SVE-NEXT: stp q5, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q2, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: shuffle_ext_byone_v16i32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -384,6 +959,23 @@ ; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v16i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z0.s, z0.s[7] +; VBITS_GE_384-NEXT: mov z3.s, z2.s[7] +; VBITS_GE_384-NEXT: fmov w9, s0 +; VBITS_GE_384-NEXT: fmov w10, s3 +; VBITS_GE_384-NEXT: insr z2.s, w9 +; VBITS_GE_384-NEXT: insr z1.s, w10 +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x0] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -403,6 +995,85 @@ } define void @shuffle_ext_byone_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v32i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q2, q1, [x1, #96] +; NO_SVE-NEXT: ldp q6, q5, [x1, #64] +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #12 +; NO_SVE-NEXT: ldp q4, q3, [x1, #32] +; NO_SVE-NEXT: ext v2.16b, v5.16b, v2.16b, #12 +; NO_SVE-NEXT: ldp q16, q7, [x1] +; NO_SVE-NEXT: ext v17.16b, v4.16b, v3.16b, #12 +; NO_SVE-NEXT: ldr q0, [x0, #112] +; NO_SVE-NEXT: stp q2, q1, [x0, #96] +; NO_SVE-NEXT: ext v4.16b, v7.16b, v4.16b, #12 +; NO_SVE-NEXT: ext v7.16b, v16.16b, v7.16b, #12 +; NO_SVE-NEXT: ext v0.16b, v0.16b, v16.16b, #12 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v6.16b, #12 +; NO_SVE-NEXT: ext v2.16b, v6.16b, v5.16b, #12 +; NO_SVE-NEXT: stp q4, q17, [x0, #32] +; NO_SVE-NEXT: stp q0, q7, [x0] +; NO_SVE-NEXT: stp q1, q2, [x0, #64] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v32i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x1] +; VBITS_EQ_256-NEXT: mov z5.s, z0.s[7] +; VBITS_EQ_256-NEXT: fmov w11, s5 +; VBITS_EQ_256-NEXT: mov z5.s, z2.s[7] +; VBITS_EQ_256-NEXT: mov z1.s, z1.s[7] +; VBITS_EQ_256-NEXT: fmov w12, s5 +; VBITS_EQ_256-NEXT: mov z5.s, z4.s[7] +; VBITS_EQ_256-NEXT: fmov w13, s1 +; VBITS_EQ_256-NEXT: fmov w14, s5 +; VBITS_EQ_256-NEXT: insr z3.s, w11 +; VBITS_EQ_256-NEXT: insr z0.s, w12 +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: insr z4.s, w13 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: insr z2.s, w14 +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: ret +; 
+; VBITS_GE_384-LABEL: shuffle_ext_byone_v32i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: mov x10, #8 +; VBITS_GE_384-NEXT: mov x9, #24 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z4.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z5.s, z0.s[7] +; VBITS_GE_384-NEXT: fmov w11, s5 +; VBITS_GE_384-NEXT: mov z5.s, z2.s[7] +; VBITS_GE_384-NEXT: mov z1.s, z1.s[7] +; VBITS_GE_384-NEXT: fmov w12, s5 +; VBITS_GE_384-NEXT: mov z5.s, z4.s[7] +; VBITS_GE_384-NEXT: fmov w13, s1 +; VBITS_GE_384-NEXT: fmov w14, s5 +; VBITS_GE_384-NEXT: insr z3.s, w11 +; VBITS_GE_384-NEXT: insr z0.s, w12 +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: insr z4.s, w13 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: insr z2.s, w14 +; VBITS_GE_384-NEXT: st1w { z4.s }, p0, [x0] +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -425,6 +1096,149 @@ } define void @shuffle_ext_byone_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v64i32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #160] +; NO_SVE-NEXT: ldp q3, q2, [x1, #128] +; NO_SVE-NEXT: ldp q6, q7, [x1, #192] +; NO_SVE-NEXT: ext v19.16b, v3.16b, v2.16b, #12 +; NO_SVE-NEXT: ldp q18, q16, [x1, #96] +; NO_SVE-NEXT: ext v21.16b, v6.16b, v7.16b, #12 +; NO_SVE-NEXT: ext v6.16b, v0.16b, v6.16b, #12 +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #12 +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #12 +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: ext v3.16b, v16.16b, v3.16b, #12 +; NO_SVE-NEXT: ldp q22, q20, [x1, #32] +; NO_SVE-NEXT: ext v17.16b, v5.16b, v4.16b, #12 +; NO_SVE-NEXT: ldp q24, q23, [x1] +; NO_SVE-NEXT: ldp q25, q2, [x1, #224] +; NO_SVE-NEXT: stp q3, q19, [x0, #128] +; NO_SVE-NEXT: stp q1, q0, [x0, #160] +; NO_SVE-NEXT: stp q6, q21, [x0, #192] +; NO_SVE-NEXT: ext v6.16b, v18.16b, v16.16b, #12 +; NO_SVE-NEXT: ext v0.16b, v4.16b, v18.16b, #12 +; NO_SVE-NEXT: ext v4.16b, v20.16b, v5.16b, #12 +; NO_SVE-NEXT: ldr q1, [x0, #240] +; NO_SVE-NEXT: ext v3.16b, v22.16b, v20.16b, #12 +; NO_SVE-NEXT: ext v5.16b, v23.16b, v22.16b, #12 +; NO_SVE-NEXT: stp q0, q6, [x0, #96] +; NO_SVE-NEXT: ext v0.16b, v24.16b, v23.16b, #12 +; NO_SVE-NEXT: stp q4, q17, [x0, #64] +; NO_SVE-NEXT: ext v1.16b, v1.16b, v24.16b, #12 +; NO_SVE-NEXT: ext v2.16b, v25.16b, v2.16b, #12 +; NO_SVE-NEXT: stp q5, q3, [x0, #32] +; NO_SVE-NEXT: ext v3.16b, v7.16b, v25.16b, #12 +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: stp q3, q2, [x0, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v64i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x10, #32 +; VBITS_EQ_256-NEXT: mov x13, #16 +; VBITS_EQ_256-NEXT: mov x14, #24 +; VBITS_EQ_256-NEXT: mov x11, #40 +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: mov x12, #48 +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x1, x13, lsl #2] +; VBITS_EQ_256-NEXT: mov x9, #56 +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x1, x14, lsl #2] +; 
VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x1, x12, lsl #2] +; VBITS_EQ_256-NEXT: mov z6.s, z0.s[7] +; VBITS_EQ_256-NEXT: fmov w15, s6 +; VBITS_EQ_256-NEXT: ld1w { z6.s }, p0/z, [x1, x11, lsl #2] +; VBITS_EQ_256-NEXT: mov z16.s, z2.s[7] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z17.s }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fmov w16, s16 +; VBITS_EQ_256-NEXT: mov z16.s, z5.s[7] +; VBITS_EQ_256-NEXT: insr z5.s, w15 +; VBITS_EQ_256-NEXT: fmov w15, s16 +; VBITS_EQ_256-NEXT: mov z16.s, z7.s[7] +; VBITS_EQ_256-NEXT: mov z1.s, z1.s[7] +; VBITS_EQ_256-NEXT: fmov w17, s16 +; VBITS_EQ_256-NEXT: mov z16.s, z6.s[7] +; VBITS_EQ_256-NEXT: fmov w18, s16 +; VBITS_EQ_256-NEXT: mov z16.s, z4.s[7] +; VBITS_EQ_256-NEXT: insr z7.s, w15 +; VBITS_EQ_256-NEXT: fmov w15, s16 +; VBITS_EQ_256-NEXT: mov z16.s, z17.s[7] +; VBITS_EQ_256-NEXT: fmov w1, s1 +; VBITS_EQ_256-NEXT: fmov w2, s16 +; VBITS_EQ_256-NEXT: insr z2.s, w17 +; VBITS_EQ_256-NEXT: insr z6.s, w16 +; VBITS_EQ_256-NEXT: insr z4.s, w18 +; VBITS_EQ_256-NEXT: insr z3.s, w15 +; VBITS_EQ_256-NEXT: insr z17.s, w1 +; VBITS_EQ_256-NEXT: insr z0.s, w2 +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z7.s }, p0, [x0, x14, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z6.s }, p0, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z17.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v64i32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: mov x10, #32 +; VBITS_GE_384-NEXT: mov x13, #16 +; VBITS_GE_384-NEXT: mov x14, #24 +; VBITS_GE_384-NEXT: mov x11, #40 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: mov x12, #48 +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z5.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_384-NEXT: mov x9, #56 +; VBITS_GE_384-NEXT: ld1w { z7.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z4.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_384-NEXT: mov z6.s, z0.s[7] +; VBITS_GE_384-NEXT: fmov w15, s6 +; VBITS_GE_384-NEXT: ld1w { z6.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_384-NEXT: mov z16.s, z2.s[7] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z17.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: fmov w16, s16 +; VBITS_GE_384-NEXT: mov z16.s, z5.s[7] +; VBITS_GE_384-NEXT: insr z5.s, w15 +; VBITS_GE_384-NEXT: fmov w15, s16 +; VBITS_GE_384-NEXT: mov z16.s, z7.s[7] +; VBITS_GE_384-NEXT: mov z1.s, z1.s[7] +; VBITS_GE_384-NEXT: fmov w17, s16 +; VBITS_GE_384-NEXT: mov z16.s, z6.s[7] +; VBITS_GE_384-NEXT: fmov w18, s16 +; VBITS_GE_384-NEXT: mov z16.s, z4.s[7] +; VBITS_GE_384-NEXT: insr z7.s, w15 +; VBITS_GE_384-NEXT: fmov w15, s16 +; VBITS_GE_384-NEXT: mov z16.s, z17.s[7] +; VBITS_GE_384-NEXT: fmov w1, s1 +; VBITS_GE_384-NEXT: fmov w2, s16 +; VBITS_GE_384-NEXT: insr z2.s, w17 +; VBITS_GE_384-NEXT: insr z6.s, w16 +; VBITS_GE_384-NEXT: insr z4.s, w18 +; VBITS_GE_384-NEXT: insr z3.s, w15 +; VBITS_GE_384-NEXT: insr z17.s, w1 +; VBITS_GE_384-NEXT: insr z0.s, w2 +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2] +; 
VBITS_GE_384-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z7.s }, p0, [x0, x14, lsl #2] +; VBITS_GE_384-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2] +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z6.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_384-NEXT: st1w { z17.s }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -452,6 +1266,11 @@ ; Don't use SVE for 128-bit vectors define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v2i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 @@ -461,6 +1280,15 @@ } define void @shuffle_ext_byone_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v4i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q2, [x1] +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NO_SVE-NEXT: ext v1.16b, v1.16b, v2.16b, #8 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -480,6 +1308,19 @@ define void @shuffle_ext_byone_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { ; Ensure sensible type legalisation. +; NO_SVE-LABEL: shuffle_ext_byone_v8i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #32] +; NO_SVE-NEXT: ldp q4, q2, [x1] +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #8 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: ext v5.16b, v2.16b, v1.16b, #8 +; NO_SVE-NEXT: ext v2.16b, v4.16b, v2.16b, #8 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v4.16b, #8 +; NO_SVE-NEXT: stp q5, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q2, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: shuffle_ext_byone_v8i64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -497,6 +1338,23 @@ ; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v8i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z0.d, z0.d[3] +; VBITS_GE_384-NEXT: mov z3.d, z2.d[3] +; VBITS_GE_384-NEXT: fmov x9, d0 +; VBITS_GE_384-NEXT: fmov x10, d3 +; VBITS_GE_384-NEXT: insr z2.d, x9 +; VBITS_GE_384-NEXT: insr z1.d, x10 +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x0] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -515,6 +1373,85 @@ } define void @shuffle_ext_byone_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v16i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q2, q1, [x1, #96] +; NO_SVE-NEXT: ldp q6, q5, [x1, #64] +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #8 +; NO_SVE-NEXT: ldp q4, q3, [x1, #32] +; NO_SVE-NEXT: ext v2.16b, v5.16b, v2.16b, #8 +; NO_SVE-NEXT: ldp q16, q7, [x1] +; NO_SVE-NEXT: ext v17.16b, v4.16b, v3.16b, #8 +; NO_SVE-NEXT: ldr q0, [x0, #112] +; NO_SVE-NEXT: stp q2, q1, [x0, #96] +; NO_SVE-NEXT: ext v4.16b, v7.16b, v4.16b, #8 +; NO_SVE-NEXT: ext v7.16b, 
v16.16b, v7.16b, #8 +; NO_SVE-NEXT: ext v0.16b, v0.16b, v16.16b, #8 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v6.16b, #8 +; NO_SVE-NEXT: ext v2.16b, v6.16b, v5.16b, #8 +; NO_SVE-NEXT: stp q4, q17, [x0, #32] +; NO_SVE-NEXT: stp q0, q7, [x0] +; NO_SVE-NEXT: stp q1, q2, [x0, #64] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v16i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: mov z5.d, z0.d[3] +; VBITS_EQ_256-NEXT: fmov x11, d5 +; VBITS_EQ_256-NEXT: mov z5.d, z2.d[3] +; VBITS_EQ_256-NEXT: mov z1.d, z1.d[3] +; VBITS_EQ_256-NEXT: fmov x12, d5 +; VBITS_EQ_256-NEXT: mov z5.d, z4.d[3] +; VBITS_EQ_256-NEXT: fmov x13, d1 +; VBITS_EQ_256-NEXT: fmov x14, d5 +; VBITS_EQ_256-NEXT: insr z3.d, x11 +; VBITS_EQ_256-NEXT: insr z0.d, x12 +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: insr z4.d, x13 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: insr z2.d, x14 +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x0] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v16i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: mov x10, #4 +; VBITS_GE_384-NEXT: mov x9, #12 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z4.d }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z5.d, z0.d[3] +; VBITS_GE_384-NEXT: fmov x11, d5 +; VBITS_GE_384-NEXT: mov z5.d, z2.d[3] +; VBITS_GE_384-NEXT: mov z1.d, z1.d[3] +; VBITS_GE_384-NEXT: fmov x12, d5 +; VBITS_GE_384-NEXT: mov z5.d, z4.d[3] +; VBITS_GE_384-NEXT: fmov x13, d1 +; VBITS_GE_384-NEXT: fmov x14, d5 +; VBITS_GE_384-NEXT: insr z3.d, x11 +; VBITS_GE_384-NEXT: insr z0.d, x12 +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: insr z4.d, x13 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: insr z2.d, x14 +; VBITS_GE_384-NEXT: st1d { z4.d }, p0, [x0] +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -535,6 +1472,149 @@ } define void @shuffle_ext_byone_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v32i64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #160] +; NO_SVE-NEXT: ldp q3, q2, [x1, #128] +; NO_SVE-NEXT: ldp q6, q7, [x1, #192] +; NO_SVE-NEXT: ext v19.16b, v3.16b, v2.16b, #8 +; NO_SVE-NEXT: ldp q18, q16, [x1, #96] +; NO_SVE-NEXT: ext v21.16b, v6.16b, v7.16b, #8 +; NO_SVE-NEXT: ext v6.16b, v0.16b, v6.16b, #8 +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #8 +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #8 +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: ext v3.16b, v16.16b, v3.16b, #8 +; NO_SVE-NEXT: ldp q22, q20, [x1, #32] +; NO_SVE-NEXT: ext v17.16b, v5.16b, v4.16b, #8 +; 
NO_SVE-NEXT: ldp q24, q23, [x1] +; NO_SVE-NEXT: ldp q25, q2, [x1, #224] +; NO_SVE-NEXT: stp q3, q19, [x0, #128] +; NO_SVE-NEXT: stp q1, q0, [x0, #160] +; NO_SVE-NEXT: stp q6, q21, [x0, #192] +; NO_SVE-NEXT: ext v6.16b, v18.16b, v16.16b, #8 +; NO_SVE-NEXT: ext v0.16b, v4.16b, v18.16b, #8 +; NO_SVE-NEXT: ext v4.16b, v20.16b, v5.16b, #8 +; NO_SVE-NEXT: ldr q1, [x0, #240] +; NO_SVE-NEXT: ext v3.16b, v22.16b, v20.16b, #8 +; NO_SVE-NEXT: ext v5.16b, v23.16b, v22.16b, #8 +; NO_SVE-NEXT: stp q0, q6, [x0, #96] +; NO_SVE-NEXT: ext v0.16b, v24.16b, v23.16b, #8 +; NO_SVE-NEXT: stp q4, q17, [x0, #64] +; NO_SVE-NEXT: ext v1.16b, v1.16b, v24.16b, #8 +; NO_SVE-NEXT: ext v2.16b, v25.16b, v2.16b, #8 +; NO_SVE-NEXT: stp q5, q3, [x0, #32] +; NO_SVE-NEXT: ext v3.16b, v7.16b, v25.16b, #8 +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: stp q3, q2, [x0, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v32i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x13, #8 +; VBITS_EQ_256-NEXT: mov x14, #12 +; VBITS_EQ_256-NEXT: mov x11, #20 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: mov x12, #24 +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: mov x9, #28 +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: mov z6.d, z0.d[3] +; VBITS_EQ_256-NEXT: fmov x15, d6 +; VBITS_EQ_256-NEXT: ld1d { z6.d }, p0/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: mov z16.d, z2.d[3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: fmov x16, d16 +; VBITS_EQ_256-NEXT: mov z16.d, z5.d[3] +; VBITS_EQ_256-NEXT: insr z5.d, x15 +; VBITS_EQ_256-NEXT: fmov x15, d16 +; VBITS_EQ_256-NEXT: mov z16.d, z7.d[3] +; VBITS_EQ_256-NEXT: mov z1.d, z1.d[3] +; VBITS_EQ_256-NEXT: fmov x17, d16 +; VBITS_EQ_256-NEXT: mov z16.d, z6.d[3] +; VBITS_EQ_256-NEXT: fmov x18, d16 +; VBITS_EQ_256-NEXT: mov z16.d, z4.d[3] +; VBITS_EQ_256-NEXT: insr z7.d, x15 +; VBITS_EQ_256-NEXT: fmov x15, d16 +; VBITS_EQ_256-NEXT: mov z16.d, z17.d[3] +; VBITS_EQ_256-NEXT: fmov x1, d1 +; VBITS_EQ_256-NEXT: fmov x2, d16 +; VBITS_EQ_256-NEXT: insr z2.d, x17 +; VBITS_EQ_256-NEXT: insr z6.d, x16 +; VBITS_EQ_256-NEXT: insr z4.d, x18 +; VBITS_EQ_256-NEXT: insr z3.d, x15 +; VBITS_EQ_256-NEXT: insr z17.d, x1 +; VBITS_EQ_256-NEXT: insr z0.d, x2 +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z6.d }, p0, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z17.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v32i64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: mov x10, #16 +; VBITS_GE_384-NEXT: mov x13, #8 +; VBITS_GE_384-NEXT: mov x14, #12 +; VBITS_GE_384-NEXT: mov x11, #20 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: mov x12, #24 +; 
VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z5.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_384-NEXT: mov x9, #28 +; VBITS_GE_384-NEXT: ld1d { z7.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z4.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_384-NEXT: mov z6.d, z0.d[3] +; VBITS_GE_384-NEXT: fmov x15, d6 +; VBITS_GE_384-NEXT: ld1d { z6.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_384-NEXT: mov z16.d, z2.d[3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z17.d }, p0/z, [x1] +; VBITS_GE_384-NEXT: fmov x16, d16 +; VBITS_GE_384-NEXT: mov z16.d, z5.d[3] +; VBITS_GE_384-NEXT: insr z5.d, x15 +; VBITS_GE_384-NEXT: fmov x15, d16 +; VBITS_GE_384-NEXT: mov z16.d, z7.d[3] +; VBITS_GE_384-NEXT: mov z1.d, z1.d[3] +; VBITS_GE_384-NEXT: fmov x17, d16 +; VBITS_GE_384-NEXT: mov z16.d, z6.d[3] +; VBITS_GE_384-NEXT: fmov x18, d16 +; VBITS_GE_384-NEXT: mov z16.d, z4.d[3] +; VBITS_GE_384-NEXT: insr z7.d, x15 +; VBITS_GE_384-NEXT: fmov x15, d16 +; VBITS_GE_384-NEXT: mov z16.d, z17.d[3] +; VBITS_GE_384-NEXT: fmov x1, d1 +; VBITS_GE_384-NEXT: fmov x2, d16 +; VBITS_GE_384-NEXT: insr z2.d, x17 +; VBITS_GE_384-NEXT: insr z6.d, x16 +; VBITS_GE_384-NEXT: insr z4.d, x18 +; VBITS_GE_384-NEXT: insr z3.d, x15 +; VBITS_GE_384-NEXT: insr z17.d, x1 +; VBITS_GE_384-NEXT: insr z0.d, x2 +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_384-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z6.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_384-NEXT: st1d { z17.d }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -558,6 +1638,11 @@ ; Don't use SVE for 64-bit vectors define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v4f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.8b, v0.8b, v1.8b, #6 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #6 @@ -568,6 +1653,11 @@ ; Don't use SVE for 128-bit vectors define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v8f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #14 @@ -577,6 +1667,15 @@ } define void @shuffle_ext_byone_v16f16(<16 x half>* %a, <16 x half>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v16f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q2, [x1] +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NO_SVE-NEXT: ext v1.16b, v1.16b, v2.16b, #14 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 @@ -596,6 +1695,19 @@ define void @shuffle_ext_byone_v32f16(<32 x half>* %a, <32 x half>* %b) #0 { ; Ensure sensible type legalisation. 
+; NO_SVE-LABEL: shuffle_ext_byone_v32f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #32] +; NO_SVE-NEXT: ldp q4, q2, [x1] +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #14 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: ext v5.16b, v2.16b, v1.16b, #14 +; NO_SVE-NEXT: ext v2.16b, v4.16b, v2.16b, #14 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v4.16b, #14 +; NO_SVE-NEXT: stp q5, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q2, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: shuffle_ext_byone_v32f16: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #16 @@ -611,6 +1723,21 @@ ; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v32f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z0.h, z0.h[15] +; VBITS_GE_384-NEXT: mov z3.h, z2.h[15] +; VBITS_GE_384-NEXT: insr z2.h, h0 +; VBITS_GE_384-NEXT: insr z1.h, h3 +; VBITS_GE_384-NEXT: st1h { z2.h }, p0, [x0] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v32f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 @@ -631,6 +1758,77 @@ } define void @shuffle_ext_byone_v64f16(<64 x half>* %a, <64 x half>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v64f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q2, q1, [x1, #96] +; NO_SVE-NEXT: ldp q6, q5, [x1, #64] +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #14 +; NO_SVE-NEXT: ldp q4, q3, [x1, #32] +; NO_SVE-NEXT: ext v2.16b, v5.16b, v2.16b, #14 +; NO_SVE-NEXT: ldp q16, q7, [x1] +; NO_SVE-NEXT: ext v17.16b, v4.16b, v3.16b, #14 +; NO_SVE-NEXT: ldr q0, [x0, #112] +; NO_SVE-NEXT: stp q2, q1, [x0, #96] +; NO_SVE-NEXT: ext v4.16b, v7.16b, v4.16b, #14 +; NO_SVE-NEXT: ext v7.16b, v16.16b, v7.16b, #14 +; NO_SVE-NEXT: ext v0.16b, v0.16b, v16.16b, #14 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v6.16b, #14 +; NO_SVE-NEXT: ext v2.16b, v6.16b, v5.16b, #14 +; NO_SVE-NEXT: stp q4, q17, [x0, #32] +; NO_SVE-NEXT: stp q0, q7, [x0] +; NO_SVE-NEXT: stp q1, q2, [x0, #64] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v64f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #32 +; VBITS_EQ_256-NEXT: mov x9, #48 +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x1] +; VBITS_EQ_256-NEXT: mov z5.h, z0.h[15] +; VBITS_EQ_256-NEXT: insr z1.h, h5 +; VBITS_EQ_256-NEXT: mov z5.h, z3.h[15] +; VBITS_EQ_256-NEXT: mov z2.h, z2.h[15] +; VBITS_EQ_256-NEXT: insr z0.h, h5 +; VBITS_EQ_256-NEXT: mov z5.h, z4.h[15] +; VBITS_EQ_256-NEXT: insr z4.h, h2 +; VBITS_EQ_256-NEXT: insr z3.h, h5 +; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v64f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #32 +; VBITS_GE_384-NEXT: mov x9, #48 +; VBITS_GE_384-NEXT: mov x10, #16 +; 
VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z4.h }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z5.h, z0.h[15] +; VBITS_GE_384-NEXT: insr z1.h, h5 +; VBITS_GE_384-NEXT: mov z5.h, z3.h[15] +; VBITS_GE_384-NEXT: mov z2.h, z2.h[15] +; VBITS_GE_384-NEXT: insr z0.h, h5 +; VBITS_GE_384-NEXT: mov z5.h, z4.h[15] +; VBITS_GE_384-NEXT: insr z4.h, h2 +; VBITS_GE_384-NEXT: insr z3.h, h5 +; VBITS_GE_384-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: st1h { z3.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -657,6 +1855,133 @@ } define void @shuffle_ext_byone_v128f16(<128 x half>* %a, <128 x half>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v128f16: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #160] +; NO_SVE-NEXT: ldp q3, q2, [x1, #128] +; NO_SVE-NEXT: ldp q6, q7, [x1, #192] +; NO_SVE-NEXT: ext v19.16b, v3.16b, v2.16b, #14 +; NO_SVE-NEXT: ldp q18, q16, [x1, #96] +; NO_SVE-NEXT: ext v21.16b, v6.16b, v7.16b, #14 +; NO_SVE-NEXT: ext v6.16b, v0.16b, v6.16b, #14 +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #14 +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #14 +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: ext v3.16b, v16.16b, v3.16b, #14 +; NO_SVE-NEXT: ldp q22, q20, [x1, #32] +; NO_SVE-NEXT: ext v17.16b, v5.16b, v4.16b, #14 +; NO_SVE-NEXT: ldp q24, q23, [x1] +; NO_SVE-NEXT: ldp q25, q2, [x1, #224] +; NO_SVE-NEXT: stp q3, q19, [x0, #128] +; NO_SVE-NEXT: stp q1, q0, [x0, #160] +; NO_SVE-NEXT: stp q6, q21, [x0, #192] +; NO_SVE-NEXT: ext v6.16b, v18.16b, v16.16b, #14 +; NO_SVE-NEXT: ext v0.16b, v4.16b, v18.16b, #14 +; NO_SVE-NEXT: ext v4.16b, v20.16b, v5.16b, #14 +; NO_SVE-NEXT: ldr q1, [x0, #240] +; NO_SVE-NEXT: ext v3.16b, v22.16b, v20.16b, #14 +; NO_SVE-NEXT: ext v5.16b, v23.16b, v22.16b, #14 +; NO_SVE-NEXT: stp q0, q6, [x0, #96] +; NO_SVE-NEXT: ext v0.16b, v24.16b, v23.16b, #14 +; NO_SVE-NEXT: stp q4, q17, [x0, #64] +; NO_SVE-NEXT: ext v1.16b, v1.16b, v24.16b, #14 +; NO_SVE-NEXT: ext v2.16b, v25.16b, v2.16b, #14 +; NO_SVE-NEXT: stp q5, q3, [x0, #32] +; NO_SVE-NEXT: ext v3.16b, v7.16b, v25.16b, #14 +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: stp q3, q2, [x0, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v128f16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x10, #64 +; VBITS_EQ_256-NEXT: mov x9, #80 +; VBITS_EQ_256-NEXT: mov x11, #16 +; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 +; VBITS_EQ_256-NEXT: mov x12, #32 +; VBITS_EQ_256-NEXT: mov x13, #48 +; VBITS_EQ_256-NEXT: mov x8, #112 +; VBITS_EQ_256-NEXT: mov x14, #96 +; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1, x10, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z5.h }, p0/z, [x1, x11, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z4.h }, p0/z, [x1, x12, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z7.h }, p0/z, [x1, x14, lsl #1] +; VBITS_EQ_256-NEXT: ld1h { z17.h }, p0/z, [x1] +; VBITS_EQ_256-NEXT: mov z18.h, 
z3.h[15] +; VBITS_EQ_256-NEXT: mov z6.h, z1.h[15] +; VBITS_EQ_256-NEXT: insr z1.h, h18 +; VBITS_EQ_256-NEXT: mov z18.h, z5.h[15] +; VBITS_EQ_256-NEXT: mov z19.h, z4.h[15] +; VBITS_EQ_256-NEXT: insr z4.h, h18 +; VBITS_EQ_256-NEXT: mov z18.h, z16.h[15] +; VBITS_EQ_256-NEXT: insr z3.h, h18 +; VBITS_EQ_256-NEXT: mov z18.h, z7.h[15] +; VBITS_EQ_256-NEXT: insr z7.h, h6 +; VBITS_EQ_256-NEXT: mov z0.h, z0.h[15] +; VBITS_EQ_256-NEXT: mov z6.h, z17.h[15] +; VBITS_EQ_256-NEXT: insr z16.h, h19 +; VBITS_EQ_256-NEXT: insr z2.h, h18 +; VBITS_EQ_256-NEXT: insr z17.h, h0 +; VBITS_EQ_256-NEXT: insr z5.h, h6 +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0, x8, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z7.h }, p0, [x0, x14, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z3.h }, p0, [x0, x10, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z16.h }, p0, [x0, x13, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z5.h }, p0, [x0, x11, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_EQ_256-NEXT: st1h { z17.h }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v128f16: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x10, #64 +; VBITS_GE_384-NEXT: mov x9, #80 +; VBITS_GE_384-NEXT: mov x11, #16 +; VBITS_GE_384-NEXT: ptrue p0.h, vl16 +; VBITS_GE_384-NEXT: mov x12, #32 +; VBITS_GE_384-NEXT: mov x13, #48 +; VBITS_GE_384-NEXT: mov x8, #112 +; VBITS_GE_384-NEXT: mov x14, #96 +; VBITS_GE_384-NEXT: ld1h { z3.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z5.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z4.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z7.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_384-NEXT: ld1h { z17.h }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z18.h, z3.h[15] +; VBITS_GE_384-NEXT: mov z6.h, z1.h[15] +; VBITS_GE_384-NEXT: insr z1.h, h18 +; VBITS_GE_384-NEXT: mov z18.h, z5.h[15] +; VBITS_GE_384-NEXT: mov z19.h, z4.h[15] +; VBITS_GE_384-NEXT: insr z4.h, h18 +; VBITS_GE_384-NEXT: mov z18.h, z16.h[15] +; VBITS_GE_384-NEXT: insr z3.h, h18 +; VBITS_GE_384-NEXT: mov z18.h, z7.h[15] +; VBITS_GE_384-NEXT: insr z7.h, h6 +; VBITS_GE_384-NEXT: mov z0.h, z0.h[15] +; VBITS_GE_384-NEXT: mov z6.h, z17.h[15] +; VBITS_GE_384-NEXT: insr z16.h, h19 +; VBITS_GE_384-NEXT: insr z2.h, h18 +; VBITS_GE_384-NEXT: insr z17.h, h0 +; VBITS_GE_384-NEXT: insr z5.h, h6 +; VBITS_GE_384-NEXT: st1h { z2.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_384-NEXT: st1h { z7.h }, p0, [x0, x14, lsl #1] +; VBITS_GE_384-NEXT: st1h { z3.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_384-NEXT: st1h { z16.h }, p0, [x0, x13, lsl #1] +; VBITS_GE_384-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_384-NEXT: st1h { z5.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_384-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_384-NEXT: st1h { z17.h }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -692,6 +2017,11 @@ ; Don't use SVE for 64-bit vectors define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v2f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v2f32: ; CHECK: // %bb.0: ; 
CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4 @@ -702,6 +2032,11 @@ ; Don't use SVE for 128-bit vectors define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v4f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12 @@ -711,6 +2046,15 @@ } define void @shuffle_ext_byone_v8f32(<8 x float>* %a, <8 x float>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v8f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q2, [x1] +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NO_SVE-NEXT: ext v1.16b, v1.16b, v2.16b, #12 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -729,6 +2073,19 @@ define void @shuffle_ext_byone_v16f32(<16 x float>* %a, <16 x float>* %b) #0 { ; Ensure sensible type legalisation. +; NO_SVE-LABEL: shuffle_ext_byone_v16f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #32] +; NO_SVE-NEXT: ldp q4, q2, [x1] +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #12 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: ext v5.16b, v2.16b, v1.16b, #12 +; NO_SVE-NEXT: ext v2.16b, v4.16b, v2.16b, #12 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v4.16b, #12 +; NO_SVE-NEXT: stp q5, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q2, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: shuffle_ext_byone_v16f32: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #8 @@ -744,6 +2101,21 @@ ; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v16f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z0.s, z0.s[7] +; VBITS_GE_384-NEXT: mov z3.s, z2.s[7] +; VBITS_GE_384-NEXT: insr z2.s, s0 +; VBITS_GE_384-NEXT: insr z1.s, s3 +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x0] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -762,6 +2134,77 @@ } define void @shuffle_ext_byone_v32f32(<32 x float>* %a, <32 x float>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v32f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q2, q1, [x1, #96] +; NO_SVE-NEXT: ldp q6, q5, [x1, #64] +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #12 +; NO_SVE-NEXT: ldp q4, q3, [x1, #32] +; NO_SVE-NEXT: ext v2.16b, v5.16b, v2.16b, #12 +; NO_SVE-NEXT: ldp q16, q7, [x1] +; NO_SVE-NEXT: ext v17.16b, v4.16b, v3.16b, #12 +; NO_SVE-NEXT: ldr q0, [x0, #112] +; NO_SVE-NEXT: stp q2, q1, [x0, #96] +; NO_SVE-NEXT: ext v4.16b, v7.16b, v4.16b, #12 +; NO_SVE-NEXT: ext v7.16b, v16.16b, v7.16b, #12 +; NO_SVE-NEXT: ext v0.16b, v0.16b, v16.16b, #12 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v6.16b, #12 +; NO_SVE-NEXT: ext v2.16b, v6.16b, v5.16b, #12 +; NO_SVE-NEXT: stp q4, q17, [x0, #32] +; NO_SVE-NEXT: stp q0, q7, [x0] +; NO_SVE-NEXT: stp q1, q2, [x0, #64] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v32f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x9, #24 +; VBITS_EQ_256-NEXT: mov x10, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; 
VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x1] +; VBITS_EQ_256-NEXT: mov z5.s, z0.s[7] +; VBITS_EQ_256-NEXT: insr z1.s, s5 +; VBITS_EQ_256-NEXT: mov z5.s, z3.s[7] +; VBITS_EQ_256-NEXT: mov z2.s, z2.s[7] +; VBITS_EQ_256-NEXT: insr z0.s, s5 +; VBITS_EQ_256-NEXT: mov z5.s, z4.s[7] +; VBITS_EQ_256-NEXT: insr z4.s, s2 +; VBITS_EQ_256-NEXT: insr z3.s, s5 +; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v32f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #16 +; VBITS_GE_384-NEXT: mov x9, #24 +; VBITS_GE_384-NEXT: mov x10, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z4.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z5.s, z0.s[7] +; VBITS_GE_384-NEXT: insr z1.s, s5 +; VBITS_GE_384-NEXT: mov z5.s, z3.s[7] +; VBITS_GE_384-NEXT: mov z2.s, z2.s[7] +; VBITS_GE_384-NEXT: insr z0.s, s5 +; VBITS_GE_384-NEXT: mov z5.s, z4.s[7] +; VBITS_GE_384-NEXT: insr z4.s, s2 +; VBITS_GE_384-NEXT: insr z3.s, s5 +; VBITS_GE_384-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z4.s }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -784,6 +2227,133 @@ } define void @shuffle_ext_byone_v64f32(<64 x float>* %a, <64 x float>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v64f32: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #160] +; NO_SVE-NEXT: ldp q3, q2, [x1, #128] +; NO_SVE-NEXT: ldp q6, q7, [x1, #192] +; NO_SVE-NEXT: ext v19.16b, v3.16b, v2.16b, #12 +; NO_SVE-NEXT: ldp q18, q16, [x1, #96] +; NO_SVE-NEXT: ext v21.16b, v6.16b, v7.16b, #12 +; NO_SVE-NEXT: ext v6.16b, v0.16b, v6.16b, #12 +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #12 +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #12 +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: ext v3.16b, v16.16b, v3.16b, #12 +; NO_SVE-NEXT: ldp q22, q20, [x1, #32] +; NO_SVE-NEXT: ext v17.16b, v5.16b, v4.16b, #12 +; NO_SVE-NEXT: ldp q24, q23, [x1] +; NO_SVE-NEXT: ldp q25, q2, [x1, #224] +; NO_SVE-NEXT: stp q3, q19, [x0, #128] +; NO_SVE-NEXT: stp q1, q0, [x0, #160] +; NO_SVE-NEXT: stp q6, q21, [x0, #192] +; NO_SVE-NEXT: ext v6.16b, v18.16b, v16.16b, #12 +; NO_SVE-NEXT: ext v0.16b, v4.16b, v18.16b, #12 +; NO_SVE-NEXT: ext v4.16b, v20.16b, v5.16b, #12 +; NO_SVE-NEXT: ldr q1, [x0, #240] +; NO_SVE-NEXT: ext v3.16b, v22.16b, v20.16b, #12 +; NO_SVE-NEXT: ext v5.16b, v23.16b, v22.16b, #12 +; NO_SVE-NEXT: stp q0, q6, [x0, #96] +; NO_SVE-NEXT: ext v0.16b, v24.16b, v23.16b, #12 +; NO_SVE-NEXT: stp q4, q17, [x0, #64] +; NO_SVE-NEXT: ext v1.16b, v1.16b, v24.16b, #12 +; NO_SVE-NEXT: ext v2.16b, v25.16b, v2.16b, #12 +; NO_SVE-NEXT: stp q5, q3, [x0, #32] +; NO_SVE-NEXT: ext v3.16b, v7.16b, v25.16b, #12 +; 
NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: stp q3, q2, [x0, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v64f32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x10, #32 +; VBITS_EQ_256-NEXT: mov x9, #40 +; VBITS_EQ_256-NEXT: mov x11, #8 +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: mov x12, #16 +; VBITS_EQ_256-NEXT: mov x13, #24 +; VBITS_EQ_256-NEXT: mov x8, #56 +; VBITS_EQ_256-NEXT: mov x14, #48 +; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1, x10, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z5.s }, p0/z, [x1, x11, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z4.s }, p0/z, [x1, x12, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z7.s }, p0/z, [x1, x14, lsl #2] +; VBITS_EQ_256-NEXT: ld1w { z17.s }, p0/z, [x1] +; VBITS_EQ_256-NEXT: mov z18.s, z3.s[7] +; VBITS_EQ_256-NEXT: mov z6.s, z1.s[7] +; VBITS_EQ_256-NEXT: insr z1.s, s18 +; VBITS_EQ_256-NEXT: mov z18.s, z5.s[7] +; VBITS_EQ_256-NEXT: mov z19.s, z4.s[7] +; VBITS_EQ_256-NEXT: insr z4.s, s18 +; VBITS_EQ_256-NEXT: mov z18.s, z16.s[7] +; VBITS_EQ_256-NEXT: insr z3.s, s18 +; VBITS_EQ_256-NEXT: mov z18.s, z7.s[7] +; VBITS_EQ_256-NEXT: insr z7.s, s6 +; VBITS_EQ_256-NEXT: mov z0.s, z0.s[7] +; VBITS_EQ_256-NEXT: mov z6.s, z17.s[7] +; VBITS_EQ_256-NEXT: insr z16.s, s19 +; VBITS_EQ_256-NEXT: insr z2.s, s18 +; VBITS_EQ_256-NEXT: insr z17.s, s0 +; VBITS_EQ_256-NEXT: insr z5.s, s6 +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z7.s }, p0, [x0, x14, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z16.s }, p0, [x0, x13, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z5.s }, p0, [x0, x11, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_EQ_256-NEXT: st1w { z17.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v64f32: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x10, #32 +; VBITS_GE_384-NEXT: mov x9, #40 +; VBITS_GE_384-NEXT: mov x11, #8 +; VBITS_GE_384-NEXT: ptrue p0.s, vl8 +; VBITS_GE_384-NEXT: mov x12, #16 +; VBITS_GE_384-NEXT: mov x13, #24 +; VBITS_GE_384-NEXT: mov x8, #56 +; VBITS_GE_384-NEXT: mov x14, #48 +; VBITS_GE_384-NEXT: ld1w { z3.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z5.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z4.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z7.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_384-NEXT: ld1w { z17.s }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z18.s, z3.s[7] +; VBITS_GE_384-NEXT: mov z6.s, z1.s[7] +; VBITS_GE_384-NEXT: insr z1.s, s18 +; VBITS_GE_384-NEXT: mov z18.s, z5.s[7] +; VBITS_GE_384-NEXT: mov z19.s, z4.s[7] +; VBITS_GE_384-NEXT: insr z4.s, s18 +; VBITS_GE_384-NEXT: mov z18.s, z16.s[7] +; VBITS_GE_384-NEXT: insr z3.s, s18 +; VBITS_GE_384-NEXT: mov z18.s, z7.s[7] +; VBITS_GE_384-NEXT: insr z7.s, s6 +; VBITS_GE_384-NEXT: mov z0.s, z0.s[7] +; VBITS_GE_384-NEXT: mov z6.s, z17.s[7] +; VBITS_GE_384-NEXT: insr z16.s, s19 +; VBITS_GE_384-NEXT: insr z2.s, s18 +; VBITS_GE_384-NEXT: 
insr z17.s, s0 +; VBITS_GE_384-NEXT: insr z5.s, s6 +; VBITS_GE_384-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_384-NEXT: st1w { z7.s }, p0, [x0, x14, lsl #2] +; VBITS_GE_384-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_384-NEXT: st1w { z16.s }, p0, [x0, x13, lsl #2] +; VBITS_GE_384-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_384-NEXT: st1w { z5.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_384-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_384-NEXT: st1w { z17.s }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -811,6 +2381,11 @@ ; Don't use SVE for 128-bit vectors define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op2) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v2f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 @@ -820,6 +2395,15 @@ } define void @shuffle_ext_byone_v4f64(<4 x double>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v4f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q2, [x1] +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NO_SVE-NEXT: ext v1.16b, v1.16b, v2.16b, #8 +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -838,6 +2422,19 @@ define void @shuffle_ext_byone_v8f64(<8 x double>* %a, <8 x double>* %b) #0 { ; Ensure sensible type legalisation. +; NO_SVE-LABEL: shuffle_ext_byone_v8f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #32] +; NO_SVE-NEXT: ldp q4, q2, [x1] +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #8 +; NO_SVE-NEXT: ldr q3, [x0, #48] +; NO_SVE-NEXT: ext v5.16b, v2.16b, v1.16b, #8 +; NO_SVE-NEXT: ext v2.16b, v4.16b, v2.16b, #8 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v4.16b, #8 +; NO_SVE-NEXT: stp q5, q0, [x0, #32] +; NO_SVE-NEXT: stp q1, q2, [x0] +; NO_SVE-NEXT: ret +; ; VBITS_EQ_256-LABEL: shuffle_ext_byone_v8f64: ; VBITS_EQ_256: // %bb.0: ; VBITS_EQ_256-NEXT: mov x8, #4 @@ -853,6 +2450,21 @@ ; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] ; VBITS_EQ_256-NEXT: ret ; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v8f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z0.d, z0.d[3] +; VBITS_GE_384-NEXT: mov z3.d, z2.d[3] +; VBITS_GE_384-NEXT: insr z2.d, d0 +; VBITS_GE_384-NEXT: insr z1.d, d3 +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x0] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -870,6 +2482,77 @@ } define void @shuffle_ext_byone_v16f64(<16 x double>* %a, <16 x double>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v16f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q2, q1, [x1, #96] +; NO_SVE-NEXT: ldp q6, q5, [x1, #64] +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #8 +; NO_SVE-NEXT: ldp q4, q3, [x1, #32] +; NO_SVE-NEXT: ext v2.16b, v5.16b, v2.16b, #8 +; NO_SVE-NEXT: ldp q16, q7, [x1] +; NO_SVE-NEXT: ext v17.16b, v4.16b, v3.16b, #8 +; NO_SVE-NEXT: ldr q0, [x0, #112] +; NO_SVE-NEXT: stp q2, q1, [x0, #96] +; 
NO_SVE-NEXT: ext v4.16b, v7.16b, v4.16b, #8 +; NO_SVE-NEXT: ext v7.16b, v16.16b, v7.16b, #8 +; NO_SVE-NEXT: ext v0.16b, v0.16b, v16.16b, #8 +; NO_SVE-NEXT: ext v1.16b, v3.16b, v6.16b, #8 +; NO_SVE-NEXT: ext v2.16b, v6.16b, v5.16b, #8 +; NO_SVE-NEXT: stp q4, q17, [x0, #32] +; NO_SVE-NEXT: stp q0, q7, [x0] +; NO_SVE-NEXT: stp q1, q2, [x0, #64] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v16f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x9, #12 +; VBITS_EQ_256-NEXT: mov x10, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: mov z5.d, z0.d[3] +; VBITS_EQ_256-NEXT: insr z1.d, d5 +; VBITS_EQ_256-NEXT: mov z5.d, z3.d[3] +; VBITS_EQ_256-NEXT: mov z2.d, z2.d[3] +; VBITS_EQ_256-NEXT: insr z0.d, d5 +; VBITS_EQ_256-NEXT: mov z5.d, z4.d[3] +; VBITS_EQ_256-NEXT: insr z4.d, d2 +; VBITS_EQ_256-NEXT: insr z3.d, d5 +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v16f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x8, #8 +; VBITS_GE_384-NEXT: mov x9, #12 +; VBITS_GE_384-NEXT: mov x10, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z4.d }, p0/z, [x1] +; VBITS_GE_384-NEXT: mov z5.d, z0.d[3] +; VBITS_GE_384-NEXT: insr z1.d, d5 +; VBITS_GE_384-NEXT: mov z5.d, z3.d[3] +; VBITS_GE_384-NEXT: mov z2.d, z2.d[3] +; VBITS_GE_384-NEXT: insr z0.d, d5 +; VBITS_GE_384-NEXT: mov z5.d, z4.d[3] +; VBITS_GE_384-NEXT: insr z4.d, d2 +; VBITS_GE_384-NEXT: insr z3.d, d5 +; VBITS_GE_384-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z4.d }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -890,6 +2573,133 @@ } define void @shuffle_ext_byone_v32f64(<32 x double>* %a, <32 x double>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_v32f64: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q1, q0, [x1, #160] +; NO_SVE-NEXT: ldp q3, q2, [x1, #128] +; NO_SVE-NEXT: ldp q6, q7, [x1, #192] +; NO_SVE-NEXT: ext v19.16b, v3.16b, v2.16b, #8 +; NO_SVE-NEXT: ldp q18, q16, [x1, #96] +; NO_SVE-NEXT: ext v21.16b, v6.16b, v7.16b, #8 +; NO_SVE-NEXT: ext v6.16b, v0.16b, v6.16b, #8 +; NO_SVE-NEXT: ext v0.16b, v1.16b, v0.16b, #8 +; NO_SVE-NEXT: ext v1.16b, v2.16b, v1.16b, #8 +; NO_SVE-NEXT: ldp q5, q4, [x1, #64] +; NO_SVE-NEXT: ext v3.16b, v16.16b, v3.16b, #8 +; NO_SVE-NEXT: ldp q22, q20, [x1, #32] +; NO_SVE-NEXT: ext v17.16b, v5.16b, v4.16b, #8 +; NO_SVE-NEXT: ldp q24, q23, [x1] +; NO_SVE-NEXT: ldp q25, q2, [x1, #224] +; NO_SVE-NEXT: stp q3, q19, [x0, #128] +; NO_SVE-NEXT: stp q1, q0, [x0, #160] +; NO_SVE-NEXT: stp q6, q21, [x0, #192] +; NO_SVE-NEXT: ext 
v6.16b, v18.16b, v16.16b, #8 +; NO_SVE-NEXT: ext v0.16b, v4.16b, v18.16b, #8 +; NO_SVE-NEXT: ext v4.16b, v20.16b, v5.16b, #8 +; NO_SVE-NEXT: ldr q1, [x0, #240] +; NO_SVE-NEXT: ext v3.16b, v22.16b, v20.16b, #8 +; NO_SVE-NEXT: ext v5.16b, v23.16b, v22.16b, #8 +; NO_SVE-NEXT: stp q0, q6, [x0, #96] +; NO_SVE-NEXT: ext v0.16b, v24.16b, v23.16b, #8 +; NO_SVE-NEXT: stp q4, q17, [x0, #64] +; NO_SVE-NEXT: ext v1.16b, v1.16b, v24.16b, #8 +; NO_SVE-NEXT: ext v2.16b, v25.16b, v2.16b, #8 +; NO_SVE-NEXT: stp q5, q3, [x0, #32] +; NO_SVE-NEXT: ext v3.16b, v7.16b, v25.16b, #8 +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: stp q3, q2, [x0, #224] +; NO_SVE-NEXT: ret +; +; VBITS_EQ_256-LABEL: shuffle_ext_byone_v32f64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x10, #16 +; VBITS_EQ_256-NEXT: mov x9, #20 +; VBITS_EQ_256-NEXT: mov x11, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: mov x12, #8 +; VBITS_EQ_256-NEXT: mov x13, #12 +; VBITS_EQ_256-NEXT: mov x8, #28 +; VBITS_EQ_256-NEXT: mov x14, #24 +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1, x10, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z5.d }, p0/z, [x1, x11, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x12, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z7.d }, p0/z, [x1, x14, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z17.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: mov z18.d, z3.d[3] +; VBITS_EQ_256-NEXT: mov z6.d, z1.d[3] +; VBITS_EQ_256-NEXT: insr z1.d, d18 +; VBITS_EQ_256-NEXT: mov z18.d, z5.d[3] +; VBITS_EQ_256-NEXT: mov z19.d, z4.d[3] +; VBITS_EQ_256-NEXT: insr z4.d, d18 +; VBITS_EQ_256-NEXT: mov z18.d, z16.d[3] +; VBITS_EQ_256-NEXT: insr z3.d, d18 +; VBITS_EQ_256-NEXT: mov z18.d, z7.d[3] +; VBITS_EQ_256-NEXT: insr z7.d, d6 +; VBITS_EQ_256-NEXT: mov z0.d, z0.d[3] +; VBITS_EQ_256-NEXT: mov z6.d, z17.d[3] +; VBITS_EQ_256-NEXT: insr z16.d, d19 +; VBITS_EQ_256-NEXT: insr z2.d, d18 +; VBITS_EQ_256-NEXT: insr z17.d, d0 +; VBITS_EQ_256-NEXT: insr z5.d, d6 +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z3.d }, p0, [x0, x10, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z16.d }, p0, [x0, x13, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z5.d }, p0, [x0, x11, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z17.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_384-LABEL: shuffle_ext_byone_v32f64: +; VBITS_GE_384: // %bb.0: +; VBITS_GE_384-NEXT: mov x10, #16 +; VBITS_GE_384-NEXT: mov x9, #20 +; VBITS_GE_384-NEXT: mov x11, #4 +; VBITS_GE_384-NEXT: ptrue p0.d, vl4 +; VBITS_GE_384-NEXT: mov x12, #8 +; VBITS_GE_384-NEXT: mov x13, #12 +; VBITS_GE_384-NEXT: mov x8, #28 +; VBITS_GE_384-NEXT: mov x14, #24 +; VBITS_GE_384-NEXT: ld1d { z3.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z5.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z4.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z7.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_384-NEXT: ld1d { z17.d }, p0/z, [x1] +; 
VBITS_GE_384-NEXT: mov z18.d, z3.d[3] +; VBITS_GE_384-NEXT: mov z6.d, z1.d[3] +; VBITS_GE_384-NEXT: insr z1.d, d18 +; VBITS_GE_384-NEXT: mov z18.d, z5.d[3] +; VBITS_GE_384-NEXT: mov z19.d, z4.d[3] +; VBITS_GE_384-NEXT: insr z4.d, d18 +; VBITS_GE_384-NEXT: mov z18.d, z16.d[3] +; VBITS_GE_384-NEXT: insr z3.d, d18 +; VBITS_GE_384-NEXT: mov z18.d, z7.d[3] +; VBITS_GE_384-NEXT: insr z7.d, d6 +; VBITS_GE_384-NEXT: mov z0.d, z0.d[3] +; VBITS_GE_384-NEXT: mov z6.d, z17.d[3] +; VBITS_GE_384-NEXT: insr z16.d, d19 +; VBITS_GE_384-NEXT: insr z2.d, d18 +; VBITS_GE_384-NEXT: insr z17.d, d0 +; VBITS_GE_384-NEXT: insr z5.d, d6 +; VBITS_GE_384-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_384-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_384-NEXT: st1d { z3.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_384-NEXT: st1d { z16.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_384-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_384-NEXT: st1d { z5.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_384-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_384-NEXT: st1d { z17.d }, p0, [x0] +; VBITS_GE_384-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -912,6 +2722,15 @@ } define void @shuffle_ext_byone_reverse(<4 x double>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_byone_reverse: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldp q0, q2, [x0] +; NO_SVE-NEXT: ldr q1, [x1, #16] +; NO_SVE-NEXT: ext v1.16b, v1.16b, v0.16b, #8 +; NO_SVE-NEXT: ext v0.16b, v0.16b, v2.16b, #8 +; NO_SVE-NEXT: stp q1, q0, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_byone_reverse: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 @@ -929,6 +2748,13 @@ } define void @shuffle_ext_invalid(<4 x double>* %a, <4 x double>* %b) #0 { +; NO_SVE-LABEL: shuffle_ext_invalid: +; NO_SVE: // %bb.0: +; NO_SVE-NEXT: ldr q0, [x0, #16] +; NO_SVE-NEXT: ldr q1, [x1] +; NO_SVE-NEXT: stp q0, q1, [x0] +; NO_SVE-NEXT: ret +; ; CHECK-LABEL: shuffle_ext_invalid: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill