diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ; i8 ; Don't use SVE for 64-bit vectors. @@ -58,13 +57,13 @@ } define void @extract_subvector_v64i8(<64 x i8>* %a, <32 x i8>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v64i8: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov w8, #32 -; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 -; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v64i8: ; VBITS_GE_512: // %bb.0: @@ -81,6 +80,18 @@ } define void @extract_subvector_v128i8(<128 x i8>* %a, <64 x i8>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v128i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #64 +; VBITS_GE_256-NEXT: mov w9, #96 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1, x8] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -96,6 +107,26 @@ } define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v256i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #128 +; VBITS_GE_256-NEXT: mov w9, #160 +; VBITS_GE_256-NEXT: mov w10, #224 +; VBITS_GE_256-NEXT: mov w11, #192 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x11] +; VBITS_GE_256-NEXT: mov w8, #64 +; VBITS_GE_256-NEXT: mov w9, #96 +; VBITS_GE_256-NEXT: mov w10, #32 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x1, x8] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x1, x9] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1, x10] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 @@ -153,13 +184,13 @@ } define void @extract_subvector_v32i16(<32 x i16>* %a, <16 x i16>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v32i16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v32i16: ; VBITS_GE_512: // %bb.0: @@ -176,6 +207,18 @@ } define void @extract_subvector_v64i16(<64 x i16>* %a, <32 x i16>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -191,6 +234,26 @@ } define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v128i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #64 +; VBITS_GE_256-NEXT: mov x9, #80 +; VBITS_GE_256-NEXT: mov x10, #112 +; VBITS_GE_256-NEXT: mov x11, #96 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -244,13 +307,13 @@ } define void @extract_subvector_v16i32(<16 x i32>* %a, <8 x i32>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v16i32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v16i32: ; VBITS_GE_512: // %bb.0: @@ -267,6 +330,18 @@ } define void @extract_subvector_v32i32(<32 x i32>* %a, <16 x i32>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -282,6 +357,26 @@ } define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #40 +; VBITS_GE_256-NEXT: mov x10, #56 +; VBITS_GE_256-NEXT: mov x11, #48 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -324,13 +419,13 @@ } define void @extract_subvector_v8i64(<8 x i64>* %a, <4 x i64>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v8i64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v8i64: ; VBITS_GE_512: // %bb.0: @@ -347,6 +442,18 @@ } define void @extract_subvector_v16i64(<16 x i64>* %a, <8 x i64>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -362,6 +469,26 @@ } define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #20 +; VBITS_GE_256-NEXT: mov x10, #28 +; VBITS_GE_256-NEXT: mov x11, #24 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -415,13 +542,13 @@ } define void @extract_subvector_v32f16(<32 x half>* %a, <16 x half>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v32f16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v32f16: ; VBITS_GE_512: // %bb.0: @@ -438,6 +565,18 @@ } define void @extract_subvector_v64f16(<64 x half>* %a, <32 x half>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -453,6 +592,26 @@ } define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v128f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #64 +; VBITS_GE_256-NEXT: mov x9, #80 +; VBITS_GE_256-NEXT: mov x10, #112 +; VBITS_GE_256-NEXT: mov x11, #96 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -506,13 +665,13 @@ } define void @extract_subvector_v16f32(<16 x float>* %a, <8 x float>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v16f32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v16f32: ; VBITS_GE_512: // %bb.0: @@ -529,6 +688,18 @@ } define void @extract_subvector_v32f32(<32 x float>* %a, <16 x float>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -544,6 +715,26 @@ } define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #40 +; VBITS_GE_256-NEXT: mov x10, #56 +; VBITS_GE_256-NEXT: mov x11, #48 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -586,13 +777,13 @@ } define void @extract_subvector_v8f64(<8 x double>* %a, <4 x double>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v8f64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v8f64: ; VBITS_GE_512: // %bb.0: @@ -609,6 +800,18 @@ } define void @extract_subvector_v16f64(<16 x double>* %a, <8 x double>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -624,6 +827,26 @@ } define void @extract_subvector_v32f64(<32 x double>* %a, <16 x double>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #20 +; VBITS_GE_256-NEXT: mov x10, #28 +; VBITS_GE_256-NEXT: mov x11, #24 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep 'z[0-9]' +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -17,12 +19,7 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: z{0-9} - -; ; FCMP OEQ -; ; Don't use SVE for 64-bit vectors. define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) #0 { @@ -66,21 +63,21 @@ define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #0 { ; Ensure sensible type legalisation -; VBITS_EQ_256-LABEL: fcmp_oeq_v32f16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1] -; VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z0.h, z2.h -; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z1.h, z3.h -; VBITS_EQ_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x2] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: fcmp_oeq_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z2.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z3.h +; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcmp_oeq_v32f16: ; VBITS_GE_512: // %bb.0: @@ -100,6 +97,34 @@ } define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) #0 { +; VBITS_GE_256-LABEL: fcmp_oeq_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z5.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z4.h +; VBITS_GE_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z6.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z3.h, z7.h +; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcmp_oeq_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -118,6 +143,58 @@ } define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) #0 { +; VBITS_GE_256-LABEL: fcmp_oeq_v128f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #96 +; VBITS_GE_256-NEXT: mov x9, #112 +; VBITS_GE_256-NEXT: mov x10, #64 +; VBITS_GE_256-NEXT: mov x11, #80 +; VBITS_GE_256-NEXT: mov x12, #32 +; VBITS_GE_256-NEXT: mov x13, #48 +; VBITS_GE_256-NEXT: mov x14, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z22.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z23.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z6.h, z17.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z5.h, z16.h +; VBITS_GE_256-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z6.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z4.h, z21.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z3.h, z20.h +; VBITS_GE_256-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z4.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z22.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z19.h +; VBITS_GE_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z18.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z7.h, z23.h +; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z7.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x2, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x2, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x2, x13, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x2, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcmp_oeq_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -177,21 +254,21 @@ define void @fcmp_oeq_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i32>* %c) #0 { ; Ensure sensible type legalisation -; VBITS_EQ_256-LABEL: fcmp_oeq_v16f32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1] -; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s -; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z1.s, z3.s -; VBITS_EQ_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x2] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: fcmp_oeq_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z3.s +; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcmp_oeq_v16f32: ; VBITS_GE_512: // %bb.0: @@ -211,6 +288,34 @@ } define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c) #0 { +; VBITS_GE_256-LABEL: fcmp_oeq_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s +; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z6.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, z7.s +; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcmp_oeq_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -229,6 +334,58 @@ } define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c) #0 { +; VBITS_GE_256-LABEL: fcmp_oeq_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #56 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x13, #24 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s +; VBITS_GE_256-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z6.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z4.s, z21.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, z20.s +; VBITS_GE_256-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z4.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z22.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z19.s +; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z18.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z7.s, z23.s +; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z7.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x2, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x2, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x2, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x2, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcmp_oeq_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -288,21 +445,21 @@ define void @fcmp_oeq_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i64>* %c) #0 { ; Ensure sensible type legalisation -; VBITS_EQ_256-LABEL: fcmp_oeq_v8f64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z0.d, z2.d -; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z1.d, z3.d -; VBITS_EQ_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x2] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: fcmp_oeq_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z3.d +; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcmp_oeq_v8f64: ; VBITS_GE_512: // %bb.0: @@ -322,6 +479,34 @@ } define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %c) #0 { +; VBITS_GE_256-LABEL: fcmp_oeq_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z5.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z4.d +; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z6.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, z7.d +; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcmp_oeq_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -340,6 +525,58 @@ } define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %c) #0 { +; VBITS_GE_256-LABEL: fcmp_oeq_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: mov x12, #8 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov x14, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, z17.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z5.d, z16.d +; VBITS_GE_256-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z6.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z4.d, z21.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, z20.d +; VBITS_GE_256-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z4.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z22.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z19.d +; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z18.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z7.d, z23.d +; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z7.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x2, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x2, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x2, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x2, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcmp_oeq_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -357,9 +594,7 @@ ret void } -; ; FCMP UEQ -; define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_ueq_v16f16: @@ -381,9 +616,7 @@ ret void } -; ; FCMP ONE -; define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_one_v16f16: @@ -405,9 +638,7 @@ ret void } -; ; FCMP UNE -; define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_une_v16f16: @@ -427,9 +658,7 @@ ret void } -; ; FCMP OGT -; define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_ogt_v16f16: @@ -449,9 +678,7 @@ ret void } -; ; FCMP UGT -; define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_ugt_v16f16: @@ -473,9 +700,7 @@ ret void } -; ; FCMP OLT -; define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_olt_v16f16: @@ -495,9 +720,7 @@ ret void } -; ; FCMP ULT -; define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_ult_v16f16: @@ -519,9 +742,7 @@ ret void } -; ; FCMP OGE -; define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_oge_v16f16: @@ -541,9 +762,7 @@ ret void } -; ; FCMP UGE -; define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_uge_v16f16: @@ -565,9 +784,7 @@ ret void } -; ; FCMP OLE -; define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_ole_v16f16: @@ -587,9 +804,7 @@ ret void } -; ; FCMP ULE -; define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_ule_v16f16: @@ -611,9 +826,7 @@ ret void } -; ; FCMP UNO -; define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_uno_v16f16: @@ -633,9 +846,7 @@ ret void } -; ; FCMP ORD -; define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_ord_v16f16: @@ -657,9 +868,7 @@ ret void } -; ; FCMP EQ -; define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_eq_v16f16: @@ -679,9 +888,7 @@ ret void } -; ; FCMP NE -; define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_ne_v16f16: @@ -701,9 +908,7 @@ ret void } -; ; FCMP GT -; define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_gt_v16f16: @@ -723,9 +928,7 @@ ret void } -; ; FCMP LT -; define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_lt_v16f16: @@ -745,9 +948,7 @@ ret void } -; ; FCMP GE -; define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_ge_v16f16: @@ -767,9 +968,7 @@ ret void } -; ; FCMP LE -; define void @fcmp_le_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: fcmp_le_v16f16: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll @@ -1,4 +1,6 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512 @@ -17,12 +19,7 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - -; ; insertelement -; ; Don't use SVE for 64-bit vectors. define <4 x half> @insertelement_v4f16(<4 x half> %op1) #0 { diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -17,12 +19,7 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - -; ; LD1B -; define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 { ; CHECK-LABEL: masked_gather_v2i8: @@ -76,35 +73,35 @@ define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: masked_gather_v8i8: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ldr d0, [x0] -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: cmeq v0.8b, v0.8b, #0 -; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_EQ_256-NEXT: zip2 v1.8b, v0.8b, v0.8b -; VBITS_EQ_256-NEXT: zip1 v0.8b, v0.8b, v0.8b -; VBITS_EQ_256-NEXT: shl v1.4h, v1.4h, #8 -; VBITS_EQ_256-NEXT: shl v0.4h, v0.4h, #8 -; VBITS_EQ_256-NEXT: sshr v1.4h, v1.4h, #8 -; VBITS_EQ_256-NEXT: sshr v0.4h, v0.4h, #8 -; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h -; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s -; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s -; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 -; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_EQ_256-NEXT: ld1b { z0.d }, p1/z, [z2.d] -; VBITS_EQ_256-NEXT: ld1b { z1.d }, p0/z, [z3.d] -; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s -; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h -; VBITS_EQ_256-NEXT: uzp1 v0.8b, v1.8b, v0.8b -; VBITS_EQ_256-NEXT: str d0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_v8i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ldr d0, [x0] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: zip2 v1.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: zip1 v0.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8 +; VBITS_GE_256-NEXT: shl v0.4h, v0.4h, #8 +; VBITS_GE_256-NEXT: sshr v1.4h, v1.4h, #8 +; VBITS_GE_256-NEXT: sshr v0.4h, v0.4h, #8 +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1b { z0.d }, p1/z, [z2.d] +; VBITS_GE_256-NEXT: ld1b { z1.d }, p0/z, [z3.d] +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 v0.8b, v1.8b, v0.8b +; VBITS_GE_256-NEXT: str d0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_gather_v8i8: ; VBITS_GE_512: // %bb.0: @@ -131,6 +128,61 @@ } define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ldr q0, [x0] +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: zip2 v1.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: zip1 v2.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8 +; VBITS_GE_256-NEXT: shl v2.4h, v2.4h, #8 +; VBITS_GE_256-NEXT: zip2 v3.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: zip1 v0.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: sshr v1.4h, v1.4h, #8 +; VBITS_GE_256-NEXT: sshr v2.4h, v2.4h, #8 +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: shl v3.4h, v3.4h, #8 +; VBITS_GE_256-NEXT: shl v0.4h, v0.4h, #8 +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: sshr v1.4h, v3.4h, #8 +; VBITS_GE_256-NEXT: sshr v0.4h, v0.4h, #8 +; VBITS_GE_256-NEXT: cmpne p2.d, p0/z, z2.d, #0 +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: ld1b { z2.d }, p1/z, [z6.d] +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1b { z3.d }, p2/z, [z7.d] +; VBITS_GE_256-NEXT: ld1b { z0.d }, p1/z, [z5.d] +; VBITS_GE_256-NEXT: ld1b { z1.d }, p0/z, [z4.d] +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: uzp1 v0.16b, v3.16b, v1.16b +; VBITS_GE_256-NEXT: str q0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ldr q0, [x0] @@ -156,6 +208,182 @@ } define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; VBITS_GE_256-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_256-NEXT: mov x29, sp +; VBITS_GE_256-NEXT: .cfi_def_cfa w29, 16 +; VBITS_GE_256-NEXT: .cfi_offset w30, -8 +; VBITS_GE_256-NEXT: .cfi_offset w29, -16 +; VBITS_GE_256-NEXT: sub x9, sp, #48 +; VBITS_GE_256-NEXT: and sp, x9, #0xffffffffffffffe0 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: zip2 v2.8b, v4.8b, v0.8b +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: shl v3.4h, v2.4h, #8 +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: sshr v5.4h, v3.4h, #8 +; VBITS_GE_256-NEXT: mov x8, #20 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1b { z5.d }, p2/z, [z7.d] +; VBITS_GE_256-NEXT: zip1 v7.8b, v4.8b, v0.8b +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: shl v7.4h, v7.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z5.h +; VBITS_GE_256-NEXT: umov w8, v5.h[3] +; VBITS_GE_256-NEXT: umov w9, v5.h[2] +; VBITS_GE_256-NEXT: umov w10, v5.h[1] +; VBITS_GE_256-NEXT: sshr v7.4h, v7.4h, #8 +; VBITS_GE_256-NEXT: umov w11, v5.h[0] +; VBITS_GE_256-NEXT: mov z5.d, z4.d +; VBITS_GE_256-NEXT: sunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: ext z5.b, z5.b, z4.b, #16 +; VBITS_GE_256-NEXT: sunpklo z7.d, z7.s +; VBITS_GE_256-NEXT: strb w8, [sp, #7] +; VBITS_GE_256-NEXT: strb w9, [sp, #6] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z7.d, #0 +; VBITS_GE_256-NEXT: strb w10, [sp, #5] +; VBITS_GE_256-NEXT: strb w11, [sp, #4] +; VBITS_GE_256-NEXT: ld1b { z7.d }, p2/z, [z17.d] +; VBITS_GE_256-NEXT: zip2 v17.8b, v5.8b, v0.8b +; VBITS_GE_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8 +; VBITS_GE_256-NEXT: uzp1 z7.s, z7.s, z7.s +; VBITS_GE_256-NEXT: shl v17.4h, v17.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z7.h, z7.h, z7.h +; VBITS_GE_256-NEXT: umov w8, v7.h[3] +; VBITS_GE_256-NEXT: umov w9, v7.h[2] +; VBITS_GE_256-NEXT: umov w10, v7.h[1] +; VBITS_GE_256-NEXT: sshr v17.4h, v17.4h, #8 +; VBITS_GE_256-NEXT: umov w11, v7.h[0] +; VBITS_GE_256-NEXT: sunpklo z7.s, z17.h +; VBITS_GE_256-NEXT: sunpklo z7.d, z7.s +; VBITS_GE_256-NEXT: strb w8, [sp, #3] +; VBITS_GE_256-NEXT: strb w9, [sp, #2] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z7.d, #0 +; VBITS_GE_256-NEXT: strb w10, [sp, #1] +; VBITS_GE_256-NEXT: strb w11, [sp] +; VBITS_GE_256-NEXT: ld1b { z7.d }, p2/z, [z16.d] +; VBITS_GE_256-NEXT: zip1 v16.8b, v5.8b, v0.8b +; VBITS_GE_256-NEXT: uzp1 z7.s, z7.s, z7.s +; VBITS_GE_256-NEXT: shl v16.4h, v16.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z7.h, z7.h, z7.h +; VBITS_GE_256-NEXT: umov w8, v7.h[3] +; VBITS_GE_256-NEXT: umov w9, v7.h[2] +; VBITS_GE_256-NEXT: umov w10, v7.h[1] +; VBITS_GE_256-NEXT: sshr v16.4h, v16.4h, #8 +; VBITS_GE_256-NEXT: umov w11, v7.h[0] +; VBITS_GE_256-NEXT: sunpklo z7.s, z16.h +; VBITS_GE_256-NEXT: sunpklo z7.d, z7.s +; VBITS_GE_256-NEXT: strb w8, [sp, #23] +; VBITS_GE_256-NEXT: strb w9, [sp, #22] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z7.d, #0 +; VBITS_GE_256-NEXT: strb w10, [sp, #21] +; VBITS_GE_256-NEXT: zip2 v7.8b, v4.8b, v0.8b +; VBITS_GE_256-NEXT: strb w11, [sp, #20] +; VBITS_GE_256-NEXT: zip1 v4.8b, v4.8b, v0.8b +; VBITS_GE_256-NEXT: ld1b { z6.d }, p2/z, [z6.d] +; VBITS_GE_256-NEXT: shl v7.4h, v7.4h, #8 +; VBITS_GE_256-NEXT: shl v4.4h, v4.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z6.s, z6.s, z6.s +; VBITS_GE_256-NEXT: sshr v7.4h, v7.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z6.h +; VBITS_GE_256-NEXT: sshr v4.4h, v4.4h, #8 +; VBITS_GE_256-NEXT: umov w8, v6.h[3] +; VBITS_GE_256-NEXT: umov w9, v6.h[2] +; VBITS_GE_256-NEXT: umov w10, v6.h[1] +; VBITS_GE_256-NEXT: umov w11, v6.h[0] +; VBITS_GE_256-NEXT: sunpklo z6.s, z7.h +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s +; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s +; VBITS_GE_256-NEXT: strb w8, [sp, #19] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: strb w9, [sp, #18] +; VBITS_GE_256-NEXT: strb w10, [sp, #17] +; VBITS_GE_256-NEXT: strb w11, [sp, #16] +; VBITS_GE_256-NEXT: ld1b { z3.d }, p2/z, [z3.d] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: umov w8, v3.h[3] +; VBITS_GE_256-NEXT: umov w9, v3.h[2] +; VBITS_GE_256-NEXT: umov w10, v3.h[1] +; VBITS_GE_256-NEXT: umov w11, v3.h[0] +; VBITS_GE_256-NEXT: ext v3.16b, v5.16b, v5.16b, #8 +; VBITS_GE_256-NEXT: strb w8, [sp, #15] +; VBITS_GE_256-NEXT: strb w9, [sp, #14] +; VBITS_GE_256-NEXT: strb w10, [sp, #13] +; VBITS_GE_256-NEXT: zip2 v4.8b, v3.8b, v0.8b +; VBITS_GE_256-NEXT: strb w11, [sp, #12] +; VBITS_GE_256-NEXT: ld1b { z2.d }, p2/z, [z2.d] +; VBITS_GE_256-NEXT: shl v4.4h, v4.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: sshr v4.4h, v4.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: umov w8, v2.h[3] +; VBITS_GE_256-NEXT: umov w9, v2.h[2] +; VBITS_GE_256-NEXT: umov w10, v2.h[1] +; VBITS_GE_256-NEXT: umov w11, v2.h[0] +; VBITS_GE_256-NEXT: sunpklo z2.s, z4.h +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: strb w8, [sp, #11] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: strb w9, [sp, #10] +; VBITS_GE_256-NEXT: zip1 v2.8b, v3.8b, v0.8b +; VBITS_GE_256-NEXT: strb w10, [sp, #9] +; VBITS_GE_256-NEXT: strb w11, [sp, #8] +; VBITS_GE_256-NEXT: ld1b { z1.d }, p2/z, [z1.d] +; VBITS_GE_256-NEXT: shl v2.4h, v2.4h, #8 +; VBITS_GE_256-NEXT: sshr v2.4h, v2.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: umov w8, v1.h[3] +; VBITS_GE_256-NEXT: umov w9, v1.h[2] +; VBITS_GE_256-NEXT: umov w10, v1.h[1] +; VBITS_GE_256-NEXT: umov w11, v1.h[0] +; VBITS_GE_256-NEXT: sunpklo z1.s, z2.h +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: strb w8, [sp, #31] +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: strb w9, [sp, #30] +; VBITS_GE_256-NEXT: strb w10, [sp, #29] +; VBITS_GE_256-NEXT: strb w11, [sp, #28] +; VBITS_GE_256-NEXT: ld1b { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: umov w8, v0.h[3] +; VBITS_GE_256-NEXT: umov w9, v0.h[2] +; VBITS_GE_256-NEXT: umov w10, v0.h[1] +; VBITS_GE_256-NEXT: umov w11, v0.h[0] +; VBITS_GE_256-NEXT: strb w8, [sp, #27] +; VBITS_GE_256-NEXT: strb w9, [sp, #26] +; VBITS_GE_256-NEXT: strb w10, [sp, #25] +; VBITS_GE_256-NEXT: strb w11, [sp, #24] +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [sp] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: mov sp, x29 +; VBITS_GE_256-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl32 @@ -180,9 +408,7 @@ ret void } -; ; LD1H -; define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 { ; CHECK-LABEL: masked_gather_v2i16: @@ -234,30 +460,30 @@ define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: masked_gather_v8i16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ldr q0, [x0] -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: cmeq v0.8h, v0.8h, #0 -; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s -; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; VBITS_EQ_256-NEXT: ld1h { z0.d }, p1/z, [z3.d] -; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h -; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s -; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_EQ_256-NEXT: ld1h { z1.d }, p0/z, [z2.d] -; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s -; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h -; VBITS_EQ_256-NEXT: mov v0.d[1], v1.d[0] -; VBITS_EQ_256-NEXT: str q0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_v8i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ldr q0, [x0] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z3.d] +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [z2.d] +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_GE_256-NEXT: str q0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_gather_v8i16: ; VBITS_GE_512: // %bb.0: @@ -282,6 +508,54 @@ } define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z4.s, z2.h +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: ld1h { z6.d }, p2/z, [z6.d] +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: ld1h { z3.d }, p2/z, [z3.d] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: sunpklo z2.d, z4.s +; VBITS_GE_256-NEXT: ld1h { z0.d }, p2/z, [z0.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: uzp1 z2.s, z6.s, z6.s +; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v2.d[1], v3.d[0] +; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z0.h +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 @@ -305,6 +579,96 @@ } define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z3.h, #0 +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z18.s, z3.h +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ext v5.16b, v3.16b, v3.16b, #8 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z18.d, #0 +; VBITS_GE_256-NEXT: ext v18.16b, v3.16b, v3.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: sunpklo z18.s, z18.h +; VBITS_GE_256-NEXT: ld1h { z17.d }, p2/z, [z17.d] +; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z4.h, #0 +; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s +; VBITS_GE_256-NEXT: ld1h { z4.d }, p3/z, [z16.d] +; VBITS_GE_256-NEXT: mov z16.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z18.d, #0 +; VBITS_GE_256-NEXT: ld1h { z3.d }, p2/z, [z7.d] +; VBITS_GE_256-NEXT: ld1h { z6.d }, p3/z, [z6.d] +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z7.h, z17.h, z17.h +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z6.s, z6.s, z6.s +; VBITS_GE_256-NEXT: mov v7.d[1], v4.d[0] +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: ext v4.16b, v16.16b, v16.16b, #8 +; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z6.h +; VBITS_GE_256-NEXT: mov v3.d[1], v6.d[0] +; VBITS_GE_256-NEXT: sunpklo z6.s, z16.h +; VBITS_GE_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s +; VBITS_GE_256-NEXT: ext v17.16b, v16.16b, v16.16b, #8 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: ld1h { z4.d }, p2/z, [z5.d] +; VBITS_GE_256-NEXT: sunpklo z5.s, z16.h +; VBITS_GE_256-NEXT: sunpklo z6.s, z17.h +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: ld1h { z2.d }, p3/z, [z2.d] +; VBITS_GE_256-NEXT: ld1h { z1.d }, p2/z, [z1.d] +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z7.h, p1, z7.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: mov v4.d[1], v2.d[0] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z1.h +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 @@ -327,9 +691,7 @@ ret void } -; ; LD1W -; define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 { ; CHECK-LABEL: masked_gather_v2i32: @@ -375,29 +737,29 @@ define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: masked_gather_v8i32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1] -; VBITS_EQ_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 -; VBITS_EQ_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b -; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b -; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s -; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [z2.d] -; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_EQ_256-NEXT: ld1w { z0.d }, p1/z, [z1.d] -; VBITS_EQ_256-NEXT: ptrue p1.s, vl4 -; VBITS_EQ_256-NEXT: uzp1 z1.s, z2.s, z2.s -; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_EQ_256-NEXT: splice z1.s, p1, z1.s, z0.s -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z2.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z1.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_gather_v8i32: ; VBITS_GE_512: // %bb.0: @@ -420,6 +782,48 @@ } define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: cmpeq p3.s, p0/z, z1.s, #0 +; VBITS_GE_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ld1w { z4.d }, p2/z, [z4.d] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z0.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: ld1w { z0.d }, p3/z, [z3.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p2/z, [z5.d] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p1/z, [z2.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z3.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z0.s +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z2.s +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 @@ -441,6 +845,84 @@ } define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: cmpeq p4.s, p0/z, z2.s, #0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z18.d] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [z17.d] +; VBITS_GE_256-NEXT: cmpeq p3.s, p0/z, z1.s, #0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z3.s, #0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [z16.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [z6.d] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [z7.d] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [z5.d] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [z4.d] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -461,9 +943,7 @@ ret void } -; ; LD1D -; ; Scalarize 1 x i64 gathers define void @masked_gather_v1i64(<1 x i64>* %a, <1 x i64*>* %b) #0 { @@ -527,21 +1007,21 @@ define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: masked_gather_v8i64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_EQ_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [z2.d] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [z3.d] -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [z2.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z3.d] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_gather_v8i64: ; VBITS_GE_512: // %bb.0: @@ -562,6 +1042,34 @@ } define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: cmpeq p3.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z2.d, #0 +; VBITS_GE_256-NEXT: cmpeq p4.d, p0/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p3/z, [z4.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z6.d] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [z5.d] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p4/z, [z7.d] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -580,6 +1088,58 @@ } define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: mov x13, #24 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [z19.d] +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z6.d, #0 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z21.d] +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [z22.d] +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p2/z, [z20.d] +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z4.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p2/z, [z18.d] +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z5.d, #0 +; VBITS_GE_256-NEXT: ld1d { z5.d }, p2/z, [z17.d] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [z16.d] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z7.d, #0 +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [z23.d] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -597,9 +1157,7 @@ ret void } -; ; LD1H (float) -; define void @masked_gather_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 { ; CHECK-LABEL: masked_gather_v2f16: @@ -661,6 +1219,31 @@ } define void @masked_gather_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v8f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ldr q0, [x0] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z3.d] +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [z2.d] +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_GE_256-NEXT: str q0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_gather_v8f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ldr q0, [x0] @@ -684,6 +1267,54 @@ } define void @masked_gather_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z4.s, z2.h +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: ld1h { z6.d }, p2/z, [z6.d] +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: ld1h { z3.d }, p2/z, [z3.d] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: sunpklo z2.d, z4.s +; VBITS_GE_256-NEXT: ld1h { z0.d }, p2/z, [z0.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: uzp1 z2.s, z6.s, z6.s +; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v2.d[1], v3.d[0] +; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z0.h +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 @@ -707,6 +1338,96 @@ } define void @masked_gather_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z3.h, #0.0 +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z18.s, z3.h +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ext v5.16b, v3.16b, v3.16b, #8 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z18.d, #0 +; VBITS_GE_256-NEXT: ext v18.16b, v3.16b, v3.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: sunpklo z18.s, z18.h +; VBITS_GE_256-NEXT: ld1h { z17.d }, p2/z, [z17.d] +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z4.h, #0.0 +; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s +; VBITS_GE_256-NEXT: ld1h { z4.d }, p3/z, [z16.d] +; VBITS_GE_256-NEXT: mov z16.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z18.d, #0 +; VBITS_GE_256-NEXT: ld1h { z3.d }, p2/z, [z7.d] +; VBITS_GE_256-NEXT: ld1h { z6.d }, p3/z, [z6.d] +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z7.h, z17.h, z17.h +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z6.s, z6.s, z6.s +; VBITS_GE_256-NEXT: mov v7.d[1], v4.d[0] +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: ext v4.16b, v16.16b, v16.16b, #8 +; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z6.h +; VBITS_GE_256-NEXT: mov v3.d[1], v6.d[0] +; VBITS_GE_256-NEXT: sunpklo z6.s, z16.h +; VBITS_GE_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s +; VBITS_GE_256-NEXT: ext v17.16b, v16.16b, v16.16b, #8 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: ld1h { z4.d }, p2/z, [z5.d] +; VBITS_GE_256-NEXT: sunpklo z5.s, z16.h +; VBITS_GE_256-NEXT: sunpklo z6.s, z17.h +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: ld1h { z2.d }, p3/z, [z2.d] +; VBITS_GE_256-NEXT: ld1h { z1.d }, p2/z, [z1.d] +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z7.h, p1, z7.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: mov v4.d[1], v2.d[0] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z1.h +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 @@ -729,9 +1450,7 @@ ret void } -; ; LD1W (float) -; define void @masked_gather_v2f32(<2 x float>* %a, <2 x float*>* %b) #0 { ; CHECK-LABEL: masked_gather_v2f32: @@ -776,6 +1495,30 @@ } define void @masked_gather_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v8f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z2.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z1.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_gather_v8f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 @@ -797,6 +1540,48 @@ } define void @masked_gather_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ld1w { z4.d }, p2/z, [z4.d] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z0.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: ld1w { z0.d }, p3/z, [z3.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p2/z, [z5.d] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p1/z, [z2.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z3.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z0.s +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z2.s +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 @@ -818,6 +1603,84 @@ } define void @masked_gather_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z18.d] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [z17.d] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [z16.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [z6.d] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [z7.d] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [z5.d] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [z4.d] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -838,9 +1701,7 @@ ret void } -; ; LD1D (float) -; ; Scalarize 1 x double gathers define void @masked_gather_v1f64(<1 x double>* %a, <1 x double*>* %b) #0 { @@ -903,6 +1764,22 @@ } define void @masked_gather_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [z2.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z3.d] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_gather_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -921,6 +1798,34 @@ } define void @masked_gather_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.d, p0/z, z1.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p4.d, p0/z, z3.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p3/z, [z4.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z6.d] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [z5.d] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p4/z, [z7.d] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -939,6 +1844,58 @@ } define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: mov x13, #24 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z0.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [z19.d] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z21.d] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [z22.d] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p2/z, [z20.d] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z4.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p2/z, [z18.d] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z5.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z5.d }, p2/z, [z17.d] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [z16.d] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z7.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [z23.d] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -960,18 +1917,60 @@ ; modes still function define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f16: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_32b_scaled_sext_f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_GE_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, sxtw #1] +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, sxtw #1] +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_GE_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, sxtw #1] +; VBITS_GE_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, sxtw #1] +; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_sext_f16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -983,15 +1982,43 @@ } define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f32: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 -; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2] -; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_32b_scaled_sext_f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p3/z, [x2, z4.s, sxtw #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p2/z, [x2, z6.s, sxtw #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x2, z5.s, sxtw #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p4/z, [x2, z7.s, sxtw #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_sext_f32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x float>, <32 x float>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -1003,6 +2030,67 @@ } define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 { +; VBITS_GE_256-LABEL: masked_gather_32b_scaled_sext_f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: mov x13, #28 +; VBITS_GE_256-NEXT: mov x14, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z18.s }, p1/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.s }, p1/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p1/z, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z19.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z0.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z5.d, #0.0 +; VBITS_GE_256-NEXT: sunpklo z22.d, z18.s +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s +; VBITS_GE_256-NEXT: sunpklo z21.d, z17.s +; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [x2, z18.d, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0 +; VBITS_GE_256-NEXT: ext z17.b, z17.b, z17.b, #16 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x2, z22.d, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0 +; VBITS_GE_256-NEXT: sunpklo z20.d, z16.s +; VBITS_GE_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_GE_256-NEXT: sunpklo z17.d, z17.s +; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [x2, z21.d, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, #0.0 +; VBITS_GE_256-NEXT: sunpklo z16.d, z16.s +; VBITS_GE_256-NEXT: sunpklo z23.d, z19.s +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p2/z, [x2, z17.d, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z4.d, #0.0 +; VBITS_GE_256-NEXT: sunpklo z19.d, z19.s +; VBITS_GE_256-NEXT: ld1d { z4.d }, p2/z, [x2, z20.d, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x2, z16.d, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z7.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z6.d }, p2/z, [x2, z23.d, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x2, z19.d, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -1023,18 +2111,60 @@ } define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_zext: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_32b_scaled_zext: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_GE_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, uxtw #1] +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, uxtw #1] +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_GE_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, uxtw #1] +; VBITS_GE_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, uxtw #1] +; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_zext: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = zext <32 x i32> %idxs to <32 x i64> @@ -1046,18 +2176,60 @@ } define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_unscaled_sext: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_32b_unscaled_sext: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_GE_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, sxtw] +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, sxtw] +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_GE_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, sxtw] +; VBITS_GE_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, sxtw] +; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_unscaled_sext: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -1070,18 +2242,60 @@ } define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_unscaled_zext: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_32b_unscaled_zext: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_GE_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, uxtw] +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, uxtw] +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_GE_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, uxtw] +; VBITS_GE_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, uxtw] +; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_unscaled_zext: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = zext <32 x i32> %idxs to <32 x i64> @@ -1094,6 +2308,84 @@ } define void @masked_gather_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %base) #0 { +; VBITS_GE_256-LABEL: masked_gather_64b_scaled: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d, lsl #2] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d, lsl #2] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d, lsl #2] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d, lsl #2] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_64b_scaled: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1116,6 +2408,84 @@ } define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %base) #0 { +; VBITS_GE_256-LABEL: masked_gather_64b_unscaled: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_64b_unscaled: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1139,6 +2509,84 @@ } define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) #0 { +; VBITS_GE_256-LABEL: masked_gather_vec_plus_reg: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_vec_plus_reg: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1162,6 +2610,84 @@ } define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_vec_plus_imm: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z18.d, #4] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [z17.d, #4] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [z16.d, #4] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [z6.d, #4] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [z7.d, #4] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [z5.d, #4] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [z0.d, #4] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [z4.d, #4] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_vec_plus_imm: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1185,6 +2711,100 @@ } define void @masked_gather_passthru(<32 x float>* %a, <32 x float*>* %b, <32 x float>* %c) #0 { +; VBITS_GE_256-LABEL: masked_gather_passthru: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z23.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z4.s, #0.0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: punpklo p3.h, p2.b +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: ld1d { z19.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z21.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z22.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x2, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x2, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x2] +; VBITS_GE_256-NEXT: ld1w { z4.d }, p3/z, [z23.d] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z17.s, #0.0 +; VBITS_GE_256-NEXT: mov z17.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_GE_256-NEXT: bif v4.16b, v16.16b, v17.16b +; VBITS_GE_256-NEXT: ext z17.b, z17.b, z17.b, #16 +; VBITS_GE_256-NEXT: sunpklo z23.d, z17.s +; VBITS_GE_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z23.d, #0 +; VBITS_GE_256-NEXT: ld1w { z22.d }, p4/z, [z22.d] +; VBITS_GE_256-NEXT: ld1w { z21.d }, p2/z, [z21.d] +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z20.s, #0.0 +; VBITS_GE_256-NEXT: mov z20.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p2.b +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: uzp1 z21.s, z21.s, z21.s +; VBITS_GE_256-NEXT: uzp1 z22.s, z22.s, z22.s +; VBITS_GE_256-NEXT: bif v21.16b, v5.16b, v20.16b +; VBITS_GE_256-NEXT: ext z20.b, z20.b, z20.b, #16 +; VBITS_GE_256-NEXT: sunpklo z23.d, z20.s +; VBITS_GE_256-NEXT: ext z5.b, z5.b, z5.b, #16 +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z23.d, #0 +; VBITS_GE_256-NEXT: ld1w { z19.d }, p4/z, [z19.d] +; VBITS_GE_256-NEXT: ld1w { z18.d }, p3/z, [z18.d] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z7.s, #0.0 +; VBITS_GE_256-NEXT: mov z7.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: uzp1 z18.s, z18.s, z18.s +; VBITS_GE_256-NEXT: bif v18.16b, v1.16b, v7.16b +; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16 +; VBITS_GE_256-NEXT: sunpklo z23.d, z7.s +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z23.d, #0 +; VBITS_GE_256-NEXT: mov z23.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ld1w { z2.d }, p4/z, [z2.d] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p2/z, [z3.d] +; VBITS_GE_256-NEXT: bit v16.16b, v22.16b, v17.16b +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: bif v3.16b, v6.16b, v23.16b +; VBITS_GE_256-NEXT: ext z23.b, z23.b, z23.b, #16 +; VBITS_GE_256-NEXT: sunpklo z17.d, z23.s +; VBITS_GE_256-NEXT: ext z6.b, z6.b, z6.b, #16 +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z17.d, #0 +; VBITS_GE_256-NEXT: uzp1 z17.s, z19.s, z19.s +; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: bit v5.16b, v17.16b, v20.16b +; VBITS_GE_256-NEXT: splice z4.s, p1, z4.s, z16.s +; VBITS_GE_256-NEXT: bit v1.16b, v2.16b, v7.16b +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: bif v0.16b, v6.16b, v23.16b +; VBITS_GE_256-NEXT: splice z21.s, p1, z21.s, z5.s +; VBITS_GE_256-NEXT: splice z18.s, p1, z18.s, z1.s +; VBITS_GE_256-NEXT: st1w { z21.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z18.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z0.s +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_passthru: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1209,6 +2829,84 @@ } define void @masked_gather_passthru_0(<32 x float>* %a, <32 x float*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_passthru_0: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z18.d] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [z17.d] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [z16.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [z6.d] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [z7.d] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [z5.d] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [z4.d] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_passthru_0: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ;; ;; Masked Stores ;; @@ -105,6 +104,20 @@ } define void @masked_store_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 { +; VBITS_GE_256-LABEL: masked_store_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z1.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_store_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -121,6 +134,30 @@ } define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 { +; VBITS_GE_256-LABEL: masked_store_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z0.s, z6.s +; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z3.s, z7.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p3, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p2, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p1, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_store_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -137,6 +174,50 @@ } define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 { +; VBITS_GE_256-LABEL: masked_store_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #56 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: mov x13, #16 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z4.s, z19.s +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z18.s +; VBITS_GE_256-NEXT: fcmeq p5.s, p0/z, z2.s, z21.s +; VBITS_GE_256-NEXT: fcmeq p6.s, p0/z, z1.s, z20.s +; VBITS_GE_256-NEXT: fcmeq p7.s, p0/z, z0.s, z22.s +; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z7.s, z23.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p7, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p6, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p5, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p4, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p3, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p2, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p1, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_store_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -153,14 +234,38 @@ } define void @masked_store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>* %dest) #0 { +; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d +; VBITS_GE_256-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z3.s, #0 +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_GE_256-NEXT: st1b { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i8: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p[[P0:[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d -; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %b = load <8 x i64>, <8 x i64>* %bp @@ -171,13 +276,41 @@ } define void @masked_store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i16>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v8i64i16: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d -; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z3.h, z3.h +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %b = load <8 x i64>, <8 x i64>* %bp %mask = icmp eq <8 x i64> %a, %b @@ -187,13 +320,38 @@ } define void @masked_store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i32>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v8i64i32: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d -; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d +; VBITS_GE_256-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z3.s, #0 +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %b = load <8 x i64>, <8 x i64>* %bp %mask = icmp eq <8 x i64> %a, %b @@ -203,13 +361,41 @@ } define void @masked_store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i8>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v16i32i8: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s -; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s +; VBITS_GE_256-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z3.b, #0 +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v16i32i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: st1b { z0.s }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, <16 x i32>* %ap %b = load <16 x i32>, <16 x i32>* %bp %mask = icmp eq <16 x i32> %a, %b @@ -219,13 +405,41 @@ } define void @masked_store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i16>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v16i32i16: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s -; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s +; VBITS_GE_256-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z0.h +; VBITS_GE_256-NEXT: sunpklo z2.h, z3.b +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v16i32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, <16 x i32>* %ap %b = load <16 x i32>, <16 x i32>* %bp %mask = icmp eq <16 x i32> %a, %b @@ -235,13 +449,38 @@ } define void @masked_store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i16>* %bp, <32 x i8>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v32i16i8: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].h, p[[P0]]/z, [[Z0]].h, [[Z1]].h -; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: masked_store_trunc_v32i16i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h +; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z1.h, z3.h +; VBITS_GE_256-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.b, vl16 +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: splice z3.b, p1, z3.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z3.b, #0 +; VBITS_GE_256-NEXT: splice z1.b, p1, z1.b, z0.b +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v32i16i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: st1b { z0.h }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <32 x i16>, <32 x i16>* %ap %b = load <32 x i16>, <32 x i16>* %bp %mask = icmp eq <32 x i16> %a, %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll @@ -1,5 +1,6 @@ -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -55,17 +56,17 @@ } define void @sdiv_v64i8(<64 x i8>* %a) #0 { -; VBITS_EQ_256-LABEL: sdiv_v64i8: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov w8, #32 -; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 -; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x0] -; VBITS_EQ_256-NEXT: asrd z0.b, p0/m, z0.b, #5 -; VBITS_EQ_256-NEXT: asrd z1.b, p0/m, z1.b, #5 -; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: sdiv_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_GE_256-NEXT: asrd z1.b, p0/m, z1.b, #5 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sdiv_v64i8: ; VBITS_GE_512: // %bb.0: @@ -81,6 +82,26 @@ } define void @sdiv_v128i8(<128 x i8>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v128i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #96 +; VBITS_GE_256-NEXT: mov w9, #32 +; VBITS_GE_256-NEXT: mov w10, #64 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.b, p0/m, z1.b, #5 +; VBITS_GE_256-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_GE_256-NEXT: asrd z2.b, p0/m, z2.b, #5 +; VBITS_GE_256-NEXT: asrd z3.b, p0/m, z3.b, #5 +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: sdiv_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -95,6 +116,42 @@ } define void @sdiv_v256i8(<256 x i8>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v256i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #192 +; VBITS_GE_256-NEXT: mov w9, #96 +; VBITS_GE_256-NEXT: mov w10, #32 +; VBITS_GE_256-NEXT: mov w11, #160 +; VBITS_GE_256-NEXT: mov w12, #64 +; VBITS_GE_256-NEXT: mov w13, #224 +; VBITS_GE_256-NEXT: mov w14, #128 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x11] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x12] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x0, x13] +; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x0, x14] +; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.b, p0/m, z1.b, #5 +; VBITS_GE_256-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_GE_256-NEXT: asrd z3.b, p0/m, z3.b, #5 +; VBITS_GE_256-NEXT: asrd z2.b, p0/m, z2.b, #5 +; VBITS_GE_256-NEXT: asrd z5.b, p0/m, z5.b, #5 +; VBITS_GE_256-NEXT: asrd z4.b, p0/m, z4.b, #5 +; VBITS_GE_256-NEXT: asrd z6.b, p0/m, z6.b, #5 +; VBITS_GE_256-NEXT: asrd z7.b, p0/m, z7.b, #5 +; VBITS_GE_256-NEXT: st1b { z6.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0, x13] +; VBITS_GE_256-NEXT: st1b { z5.b }, p0, [x0, x14] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x11] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x12] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: st1b { z7.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: sdiv_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 @@ -147,17 +204,17 @@ } define void @sdiv_v32i16(<32 x i16>* %a) #0 { -; VBITS_EQ_256-LABEL: sdiv_v32i16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_EQ_256-NEXT: asrd z0.h, p0/m, z0.h, #5 -; VBITS_EQ_256-NEXT: asrd z1.h, p0/m, z1.h, #5 -; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: sdiv_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_GE_256-NEXT: asrd z1.h, p0/m, z1.h, #5 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sdiv_v32i16: ; VBITS_GE_512: // %bb.0: @@ -173,6 +230,26 @@ } define void @sdiv_v64i16(<64 x i16>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.h, p0/m, z1.h, #5 +; VBITS_GE_256-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_GE_256-NEXT: asrd z2.h, p0/m, z2.h, #5 +; VBITS_GE_256-NEXT: asrd z3.h, p0/m, z3.h, #5 +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: sdiv_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -187,6 +264,42 @@ } define void @sdiv_v128i16(<128 x i16>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v128i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #96 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #80 +; VBITS_GE_256-NEXT: mov x12, #32 +; VBITS_GE_256-NEXT: mov x13, #112 +; VBITS_GE_256-NEXT: mov x14, #64 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.h, p0/m, z1.h, #5 +; VBITS_GE_256-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_GE_256-NEXT: asrd z3.h, p0/m, z3.h, #5 +; VBITS_GE_256-NEXT: asrd z2.h, p0/m, z2.h, #5 +; VBITS_GE_256-NEXT: asrd z5.h, p0/m, z5.h, #5 +; VBITS_GE_256-NEXT: asrd z4.h, p0/m, z4.h, #5 +; VBITS_GE_256-NEXT: asrd z6.h, p0/m, z6.h, #5 +; VBITS_GE_256-NEXT: asrd z7.h, p0/m, z7.h, #5 +; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: sdiv_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -239,17 +352,17 @@ } define void @sdiv_v16i32(<16 x i32>* %a) #0 { -; VBITS_EQ_256-LABEL: sdiv_v16i32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: asrd z0.s, p0/m, z0.s, #5 -; VBITS_EQ_256-NEXT: asrd z1.s, p0/m, z1.s, #5 -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: sdiv_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_GE_256-NEXT: asrd z1.s, p0/m, z1.s, #5 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sdiv_v16i32: ; VBITS_GE_512: // %bb.0: @@ -265,6 +378,26 @@ } define void @sdiv_v32i32(<32 x i32>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.s, p0/m, z1.s, #5 +; VBITS_GE_256-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_GE_256-NEXT: asrd z2.s, p0/m, z2.s, #5 +; VBITS_GE_256-NEXT: asrd z3.s, p0/m, z3.s, #5 +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: sdiv_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -279,6 +412,42 @@ } define void @sdiv_v64i32(<64 x i32>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x13, #56 +; VBITS_GE_256-NEXT: mov x14, #32 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.s, p0/m, z1.s, #5 +; VBITS_GE_256-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_GE_256-NEXT: asrd z3.s, p0/m, z3.s, #5 +; VBITS_GE_256-NEXT: asrd z2.s, p0/m, z2.s, #5 +; VBITS_GE_256-NEXT: asrd z5.s, p0/m, z5.s, #5 +; VBITS_GE_256-NEXT: asrd z4.s, p0/m, z4.s, #5 +; VBITS_GE_256-NEXT: asrd z6.s, p0/m, z6.s, #5 +; VBITS_GE_256-NEXT: asrd z7.s, p0/m, z7.s, #5 +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: sdiv_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -332,17 +501,17 @@ } define void @sdiv_v8i64(<8 x i64>* %a) #0 { -; VBITS_EQ_256-LABEL: sdiv_v8i64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: asrd z0.d, p0/m, z0.d, #5 -; VBITS_EQ_256-NEXT: asrd z1.d, p0/m, z1.d, #5 -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: sdiv_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z0.d, p0/m, z0.d, #5 +; VBITS_GE_256-NEXT: asrd z1.d, p0/m, z1.d, #5 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sdiv_v8i64: ; VBITS_GE_512: // %bb.0: @@ -358,6 +527,26 @@ } define void @sdiv_v16i64(<16 x i64>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.d, p0/m, z1.d, #5 +; VBITS_GE_256-NEXT: asrd z0.d, p0/m, z0.d, #5 +; VBITS_GE_256-NEXT: asrd z2.d, p0/m, z2.d, #5 +; VBITS_GE_256-NEXT: asrd z3.d, p0/m, z3.d, #5 +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: sdiv_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -372,6 +561,42 @@ } define void @sdiv_v32i64(<32 x i64>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: mov x12, #8 +; VBITS_GE_256-NEXT: mov x13, #28 +; VBITS_GE_256-NEXT: mov x14, #16 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.d, p0/m, z1.d, #5 +; VBITS_GE_256-NEXT: asrd z0.d, p0/m, z0.d, #5 +; VBITS_GE_256-NEXT: asrd z3.d, p0/m, z3.d, #5 +; VBITS_GE_256-NEXT: asrd z2.d, p0/m, z2.d, #5 +; VBITS_GE_256-NEXT: asrd z5.d, p0/m, z5.d, #5 +; VBITS_GE_256-NEXT: asrd z4.d, p0/m, z4.d, #5 +; VBITS_GE_256-NEXT: asrd z6.d, p0/m, z6.d, #5 +; VBITS_GE_256-NEXT: asrd z7.d, p0/m, z7.d, #5 +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: sdiv_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -8,6 +9,28 @@ ; successfully exits code generation. define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32> %b) #0 { ; CHECK-LABEL: hang_when_merging_stores_after_legalisation: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #48 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: stp s0, s0, [sp, #24] +; CHECK-NEXT: stp s0, s0, [sp, #16] +; CHECK-NEXT: stp s0, s0, [sp, #8] +; CHECK-NEXT: stp s0, s0, [sp] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #16 +; CHECK-NEXT: st2 { v0.4s, v1.4s }, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <8 x i32> zeroinitializer %interleaved.vec = shufflevector <8 x i32> %splat, <8 x i32> undef, <8 x i32> store <8 x i32> %interleaved.vec, <8 x i32>* %a, align 4 @@ -17,8 +40,85 @@ ; Ensure we don't crash when trying to lower a shuffle via and extract define void @crash_when_lowering_extract_shuffle(<32 x i32>* %dst, i1 %cond) #0 { ; CHECK-LABEL: crash_when_lowering_extract_shuffle: -; CHECK: ld1w { z3.s }, p0/z, [x0] -; CHECK: st1w { z3.s }, p0, [x0] +; CHECK: // %bb.0: +; CHECK-NEXT: tbnz w1, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %vector.body +; CHECK-NEXT: mov z0.b, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: umov w8, v0.b[8] +; CHECK-NEXT: umov w9, v0.b[1] +; CHECK-NEXT: umov w10, v0.b[9] +; CHECK-NEXT: umov w11, v0.b[2] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: umov w8, v0.b[10] +; CHECK-NEXT: mov v1.b[1], w9 +; CHECK-NEXT: umov w9, v0.b[3] +; CHECK-NEXT: mov v2.b[1], w10 +; CHECK-NEXT: umov w10, v0.b[11] +; CHECK-NEXT: mov v1.b[2], w11 +; CHECK-NEXT: umov w11, v0.b[7] +; CHECK-NEXT: mov v2.b[2], w8 +; CHECK-NEXT: umov w8, v0.b[4] +; CHECK-NEXT: mov v1.b[3], w9 +; CHECK-NEXT: umov w9, v0.b[12] +; CHECK-NEXT: mov v2.b[3], w10 +; CHECK-NEXT: umov w10, v0.b[5] +; CHECK-NEXT: mov v1.b[4], w8 +; CHECK-NEXT: umov w8, v0.b[13] +; CHECK-NEXT: mov v2.b[4], w9 +; CHECK-NEXT: umov w9, v0.b[6] +; CHECK-NEXT: mov v1.b[5], w10 +; CHECK-NEXT: umov w10, v0.b[14] +; CHECK-NEXT: mov v2.b[5], w8 +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov v1.b[6], w9 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: mov v2.b[6], w10 +; CHECK-NEXT: umov w10, v0.b[15] +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: dup v3.2d, v0.d[1] +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: mov v1.b[7], w11 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: mov v2.b[7], w10 +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: mov x11, #8 +; CHECK-NEXT: lsl z0.s, z0.s, #31 +; CHECK-NEXT: lsl z3.s, z3.s, #31 +; CHECK-NEXT: asr z0.s, z0.s, #31 +; CHECK-NEXT: asr z3.s, z3.s, #31 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: and z3.s, z3.s, #0x1 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x11, lsl #2] +; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0] +; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: lsl z2.s, z2.s, #31 +; CHECK-NEXT: asr z1.s, z1.s, #31 +; CHECK-NEXT: asr z2.s, z2.s, #31 +; CHECK-NEXT: and z1.s, z1.s, #0x1 +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: mov z4.s, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z5.s, p2/m, #0 // =0x0 +; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 +; CHECK-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; CHECK-NEXT: mov z3.s, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 +; CHECK-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: st1w { z5.s }, p0, [x0, x9, lsl #2] +; CHECK-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2] +; CHECK-NEXT: st1w { z3.s }, p0, [x0] +; CHECK-NEXT: .LBB1_2: // %exit +; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer br i1 %cond, label %exit, label %vector.body diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -17,18 +19,14 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - -; ; DUP (integer) -; ; Don't use SVE for 64-bit vectors. define <8 x i8> @splat_v8i8(i8 %a) #0 { ; CHECK-LABEL: splat_v8i8: -; CHECK: dup v0.8b, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.8b, w0 +; CHECK-NEXT: ret %insert = insertelement <8 x i8> undef, i8 %a, i64 0 %splat = shufflevector <8 x i8> %insert, <8 x i8> undef, <8 x i32> zeroinitializer ret <8 x i8> %splat @@ -37,8 +35,9 @@ ; Don't use SVE for 128-bit vectors. define <16 x i8> @splat_v16i8(i8 %a) #0 { ; CHECK-LABEL: splat_v16i8: -; CHECK: dup v0.16b, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.16b, w0 +; CHECK-NEXT: ret %insert = insertelement <16 x i8> undef, i8 %a, i64 0 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer ret <16 x i8> %splat @@ -46,10 +45,11 @@ define void @splat_v32i8(i8 %a, <32 x i8>* %b) #0 { ; CHECK-LABEL: splat_v32i8: -; CHECK-DAG: mov [[RES:z[0-9]+]].b, w0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl32 -; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: mov z0.b, w0 +; CHECK-NEXT: st1b { z0.b }, p0, [x1] +; CHECK-NEXT: ret %insert = insertelement <32 x i8> undef, i8 %a, i64 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer store <32 x i8> %splat, <32 x i8>* %b @@ -57,19 +57,23 @@ } define void @splat_v64i8(i8 %a, <64 x i8>* %b) #0 { -; CHECK-LABEL: splat_v64i8: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].b, w0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].b, vl64 -; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x1] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov z0.b, w0 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: mov z0.b, w0 +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. -; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].b, w0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32 -; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32 -; VBITS_EQ_256-DAG: st1b { [[RES]].b }, [[PG]], [x1] -; VBITS_EQ_256-DAG: st1b { [[RES]].b }, [[PG]], [x1, x[[NUMELTS]]] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <64 x i8> undef, i8 %a, i64 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer store <64 x i8> %splat, <64 x i8>* %b @@ -77,11 +81,25 @@ } define void @splat_v128i8(i8 %a, <128 x i8>* %b) #0 { -; CHECK-LABEL: splat_v128i8: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].b, w0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].b, vl128 -; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x1] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v128i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #96 +; VBITS_GE_256-NEXT: mov w9, #64 +; VBITS_GE_256-NEXT: mov w10, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: mov z0.b, w0 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x10] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v128i8: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 +; VBITS_GE_1024-NEXT: mov z0.b, w0 +; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <128 x i8> undef, i8 %a, i64 0 %splat = shufflevector <128 x i8> %insert, <128 x i8> undef, <128 x i32> zeroinitializer store <128 x i8> %splat, <128 x i8>* %b @@ -89,11 +107,33 @@ } define void @splat_v256i8(i8 %a, <256 x i8>* %b) #0 { -; CHECK-LABEL: splat_v256i8: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].b, w0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].b, vl256 -; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x1] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v256i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #224 +; VBITS_GE_256-NEXT: mov w9, #192 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: mov z0.b, w0 +; VBITS_GE_256-NEXT: mov w10, #160 +; VBITS_GE_256-NEXT: mov w11, #128 +; VBITS_GE_256-NEXT: mov w12, #96 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_GE_256-NEXT: mov w8, #64 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9] +; VBITS_GE_256-NEXT: mov w9, #32 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x10] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x11] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x12] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v256i8: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 +; VBITS_GE_2048-NEXT: mov z0.b, w0 +; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <256 x i8> undef, i8 %a, i64 0 %splat = shufflevector <256 x i8> %insert, <256 x i8> undef, <256 x i32> zeroinitializer store <256 x i8> %splat, <256 x i8>* %b @@ -103,8 +143,9 @@ ; Don't use SVE for 64-bit vectors. define <4 x i16> @splat_v4i16(i16 %a) #0 { ; CHECK-LABEL: splat_v4i16: -; CHECK: dup v0.4h, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.4h, w0 +; CHECK-NEXT: ret %insert = insertelement <4 x i16> undef, i16 %a, i64 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer ret <4 x i16> %splat @@ -113,8 +154,9 @@ ; Don't use SVE for 128-bit vectors. define <8 x i16> @splat_v8i16(i16 %a) #0 { ; CHECK-LABEL: splat_v8i16: -; CHECK: dup v0.8h, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.8h, w0 +; CHECK-NEXT: ret %insert = insertelement <8 x i16> undef, i16 %a, i64 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer ret <8 x i16> %splat @@ -122,10 +164,11 @@ define void @splat_v16i16(i16 %a, <16 x i16>* %b) #0 { ; CHECK-LABEL: splat_v16i16: -; CHECK-DAG: mov [[RES:z[0-9]+]].h, w0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov z0.h, w0 +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret %insert = insertelement <16 x i16> undef, i16 %a, i64 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer store <16 x i16> %splat, <16 x i16>* %b @@ -133,19 +176,23 @@ } define void @splat_v32i16(i16 %a, <32 x i16>* %b) #0 { -; CHECK-LABEL: splat_v32i16: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].h, w0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x1] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov z0.h, w0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: mov z0.h, w0 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. -; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].h, w0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16 -; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG]], [x1] -; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG]], [x1, x[[NUMELTS]], lsl #1] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <32 x i16> undef, i16 %a, i64 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer store <32 x i16> %splat, <32 x i16>* %b @@ -153,11 +200,25 @@ } define void @splat_v64i16(i16 %a, <64 x i16>* %b) #0 { -; CHECK-LABEL: splat_v64i16: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].h, w0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x1] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #32 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov z0.h, w0 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v64i16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: mov z0.h, w0 +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <64 x i16> undef, i16 %a, i64 0 %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer store <64 x i16> %splat, <64 x i16>* %b @@ -165,11 +226,33 @@ } define void @splat_v128i16(i16 %a, <128 x i16>* %b) #0 { -; CHECK-LABEL: splat_v128i16: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].h, w0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].h, vl128 -; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x1] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v128i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #112 +; VBITS_GE_256-NEXT: mov x9, #96 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov z0.h, w0 +; VBITS_GE_256-NEXT: mov x10, #80 +; VBITS_GE_256-NEXT: mov x11, #64 +; VBITS_GE_256-NEXT: mov x12, #48 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v128i16: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 +; VBITS_GE_2048-NEXT: mov z0.h, w0 +; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <128 x i16> undef, i16 %a, i64 0 %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer store <128 x i16> %splat, <128 x i16>* %b @@ -179,8 +262,9 @@ ; Don't use SVE for 64-bit vectors. define <2 x i32> @splat_v2i32(i32 %a) #0 { ; CHECK-LABEL: splat_v2i32: -; CHECK: dup v0.2s, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.2s, w0 +; CHECK-NEXT: ret %insert = insertelement <2 x i32> undef, i32 %a, i64 0 %splat = shufflevector <2 x i32> %insert, <2 x i32> undef, <2 x i32> zeroinitializer ret <2 x i32> %splat @@ -189,8 +273,9 @@ ; Don't use SVE for 128-bit vectors. define <4 x i32> @splat_v4i32(i32 %a) #0 { ; CHECK-LABEL: splat_v4i32: -; CHECK: dup v0.4s, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.4s, w0 +; CHECK-NEXT: ret %insert = insertelement <4 x i32> undef, i32 %a, i64 0 %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat @@ -198,10 +283,11 @@ define void @splat_v8i32(i32 %a, <8 x i32>* %b) #0 { ; CHECK-LABEL: splat_v8i32: -; CHECK-DAG: mov [[RES:z[0-9]+]].s, w0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: mov z0.s, w0 +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret %insert = insertelement <8 x i32> undef, i32 %a, i64 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer store <8 x i32> %splat, <8 x i32>* %b @@ -209,19 +295,23 @@ } define void @splat_v16i32(i32 %a, <16 x i32>* %b) #0 { -; CHECK-LABEL: splat_v16i32: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].s, w0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x1] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov z0.s, w0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: mov z0.s, w0 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. -; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].s, w0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8 -; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG]], [x1] -; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG]], [x1, x[[NUMELTS]], lsl #2] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <16 x i32> undef, i32 %a, i64 0 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer store <16 x i32> %splat, <16 x i32>* %b @@ -229,11 +319,25 @@ } define void @splat_v32i32(i32 %a, <32 x i32>* %b) #0 { -; CHECK-LABEL: splat_v32i32: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].s, w0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x1] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov z0.s, w0 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v32i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: mov z0.s, w0 +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <32 x i32> undef, i32 %a, i64 0 %splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer store <32 x i32> %splat, <32 x i32>* %b @@ -241,11 +345,33 @@ } define void @splat_v64i32(i32 %a, <64 x i32>* %b) #0 { -; CHECK-LABEL: splat_v64i32: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].s, w0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x1] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #56 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov z0.s, w0 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v64i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: mov z0.s, w0 +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <64 x i32> undef, i32 %a, i64 0 %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer store <64 x i32> %splat, <64 x i32>* %b @@ -255,8 +381,9 @@ ; Don't use SVE for 64-bit vectors. define <1 x i64> @splat_v1i64(i64 %a) #0 { ; CHECK-LABEL: splat_v1i64: -; CHECK: fmov d0, x0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ret %insert = insertelement <1 x i64> undef, i64 %a, i64 0 %splat = shufflevector <1 x i64> %insert, <1 x i64> undef, <1 x i32> zeroinitializer ret <1 x i64> %splat @@ -265,8 +392,9 @@ ; Don't use SVE for 128-bit vectors. define <2 x i64> @splat_v2i64(i64 %a) #0 { ; CHECK-LABEL: splat_v2i64: -; CHECK: dup v0.2d, x0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.2d, x0 +; CHECK-NEXT: ret %insert = insertelement <2 x i64> undef, i64 %a, i64 0 %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat @@ -274,10 +402,11 @@ define void @splat_v4i64(i64 %a, <4 x i64>* %b) #0 { ; CHECK-LABEL: splat_v4i64: -; CHECK-DAG: mov [[RES:z[0-9]+]].d, x0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: mov z0.d, x0 +; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: ret %insert = insertelement <4 x i64> undef, i64 %a, i64 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer store <4 x i64> %splat, <4 x i64>* %b @@ -285,19 +414,23 @@ } define void @splat_v8i64(i64 %a, <8 x i64>* %b) #0 { -; CHECK-LABEL: splat_v8i64: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].d, x0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x1] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov z0.d, x0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: mov z0.d, x0 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. -; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].d, x0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4 -; VBITS_EQ_256-DAG: st1d { [[RES]].d }, [[PG]], [x1] -; VBITS_EQ_256-DAG: st1d { [[RES]].d }, [[PG]], [x1, x[[NUMELTS]], lsl #3] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <8 x i64> undef, i64 %a, i64 0 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer store <8 x i64> %splat, <8 x i64>* %b @@ -305,11 +438,25 @@ } define void @splat_v16i64(i64 %a, <16 x i64>* %b) #0 { -; CHECK-LABEL: splat_v16i64: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].d, x0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x1] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov z0.d, x0 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v16i64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: mov z0.d, x0 +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <16 x i64> undef, i64 %a, i64 0 %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer store <16 x i64> %splat, <16 x i64>* %b @@ -317,26 +464,48 @@ } define void @splat_v32i64(i64 %a, <32 x i64>* %b) #0 { -; CHECK-LABEL: splat_v32i64: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].d, x0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x1] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #28 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov z0.d, x0 +; VBITS_GE_256-NEXT: mov x10, #20 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #12 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v32i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: mov z0.d, x0 +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <32 x i64> undef, i64 %a, i64 0 %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer store <32 x i64> %splat, <32 x i64>* %b ret void } -; ; DUP (floating-point) -; ; Don't use SVE for 64-bit vectors. define <4 x half> @splat_v4f16(half %a) #0 { ; CHECK-LABEL: splat_v4f16: -; CHECK: dup v0.4h, v0.h[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret %insert = insertelement <4 x half> undef, half %a, i64 0 %splat = shufflevector <4 x half> %insert, <4 x half> undef, <4 x i32> zeroinitializer ret <4 x half> %splat @@ -345,8 +514,10 @@ ; Don't use SVE for 128-bit vectors. define <8 x half> @splat_v8f16(half %a) #0 { ; CHECK-LABEL: splat_v8f16: -; CHECK: dup v0.8h, v0.h[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: ret %insert = insertelement <8 x half> undef, half %a, i64 0 %splat = shufflevector <8 x half> %insert, <8 x half> undef, <8 x i32> zeroinitializer ret <8 x half> %splat @@ -354,10 +525,12 @@ define void @splat_v16f16(half %a, <16 x half>* %b) #0 { ; CHECK-LABEL: splat_v16f16: -; CHECK-DAG: mov [[RES:z[0-9]+]].h, h0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ret %insert = insertelement <16 x half> undef, half %a, i64 0 %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer store <16 x half> %splat, <16 x half>* %b @@ -365,19 +538,25 @@ } define void @splat_v32f16(half %a, <32 x half>* %b) #0 { -; CHECK-LABEL: splat_v32f16: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].h, h0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov z0.h, h0 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v32f16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: mov z0.h, h0 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. -; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].h, h0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16 -; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <32 x half> undef, half %a, i64 0 %splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer store <32 x half> %splat, <32 x half>* %b @@ -385,11 +564,27 @@ } define void @splat_v64f16(half %a, <64 x half>* %b) #0 { -; CHECK-LABEL: splat_v64f16: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].h, h0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #32 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov z0.h, h0 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v64f16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: mov z0.h, h0 +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <64 x half> undef, half %a, i64 0 %splat = shufflevector <64 x half> %insert, <64 x half> undef, <64 x i32> zeroinitializer store <64 x half> %splat, <64 x half>* %b @@ -397,11 +592,35 @@ } define void @splat_v128f16(half %a, <128 x half>* %b) #0 { -; CHECK-LABEL: splat_v128f16: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].h, h0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].h, vl128 -; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v128f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #112 +; VBITS_GE_256-NEXT: mov x9, #96 +; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #80 +; VBITS_GE_256-NEXT: mov z0.h, h0 +; VBITS_GE_256-NEXT: mov x11, #64 +; VBITS_GE_256-NEXT: mov x12, #48 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v128f16: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 +; VBITS_GE_2048-NEXT: mov z0.h, h0 +; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <128 x half> undef, half %a, i64 0 %splat = shufflevector <128 x half> %insert, <128 x half> undef, <128 x i32> zeroinitializer store <128 x half> %splat, <128 x half>* %b @@ -411,8 +630,10 @@ ; Don't use SVE for 64-bit vectors. define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) #0 { ; CHECK-LABEL: splat_v2f32: -; CHECK: dup v0.2s, v0.s[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret %insert = insertelement <2 x float> undef, float %a, i64 0 %splat = shufflevector <2 x float> %insert, <2 x float> undef, <2 x i32> zeroinitializer ret <2 x float> %splat @@ -421,8 +642,10 @@ ; Don't use SVE for 128-bit vectors. define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) #0 { ; CHECK-LABEL: splat_v4f32: -; CHECK: dup v0.4s, v0.s[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret %insert = insertelement <4 x float> undef, float %a, i64 0 %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer ret <4 x float> %splat @@ -430,10 +653,12 @@ define void @splat_v8f32(float %a, <8 x float>* %b) #0 { ; CHECK-LABEL: splat_v8f32: -; CHECK-DAG: mov [[RES:z[0-9]+]].s, s0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ret %insert = insertelement <8 x float> undef, float %a, i64 0 %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer store <8 x float> %splat, <8 x float>* %b @@ -441,19 +666,25 @@ } define void @splat_v16f32(float %a, <16 x float>* %b) #0 { -; CHECK-LABEL: splat_v16f32: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].s, s0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov z0.s, s0 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v16f32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: mov z0.s, s0 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. -; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].s, s0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8 -; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <16 x float> undef, float %a, i64 0 %splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer store <16 x float> %splat, <16 x float>* %b @@ -461,11 +692,27 @@ } define void @splat_v32f32(float %a, <32 x float>* %b) #0 { -; CHECK-LABEL: splat_v32f32: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].s, s0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov z0.s, s0 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v32f32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: mov z0.s, s0 +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <32 x float> undef, float %a, i64 0 %splat = shufflevector <32 x float> %insert, <32 x float> undef, <32 x i32> zeroinitializer store <32 x float> %splat, <32 x float>* %b @@ -473,11 +720,35 @@ } define void @splat_v64f32(float %a, <64 x float>* %b) #0 { -; CHECK-LABEL: splat_v64f32: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].s, s0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #56 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: mov z0.s, s0 +; VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v64f32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: mov z0.s, s0 +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <64 x float> undef, float %a, i64 0 %splat = shufflevector <64 x float> %insert, <64 x float> undef, <64 x i32> zeroinitializer store <64 x float> %splat, <64 x float>* %b @@ -487,8 +758,8 @@ ; Don't use SVE for 64-bit vectors. define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) #0 { ; CHECK-LABEL: splat_v1f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ret %insert = insertelement <1 x double> undef, double %a, i64 0 %splat = shufflevector <1 x double> %insert, <1 x double> undef, <1 x i32> zeroinitializer ret <1 x double> %splat @@ -497,8 +768,10 @@ ; Don't use SVE for 128-bit vectors. define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) #0 { ; CHECK-LABEL: splat_v2f64: -; CHECK: dup v0.2d, v0.d[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret %insert = insertelement <2 x double> undef, double %a, i64 0 %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer ret <2 x double> %splat @@ -506,10 +779,12 @@ define void @splat_v4f64(double %a, <4 x double>* %b) #0 { ; CHECK-LABEL: splat_v4f64: -; CHECK-DAG: mov [[RES:z[0-9]+]].d, d0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: mov z0.d, d0 +; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret %insert = insertelement <4 x double> undef, double %a, i64 0 %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer store <4 x double> %splat, <4 x double>* %b @@ -517,19 +792,25 @@ } define void @splat_v8f64(double %a, <8 x double>* %b) #0 { -; CHECK-LABEL: splat_v8f64: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].d, d0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov z0.d, d0 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v8f64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: mov z0.d, d0 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. -; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].d, d0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4 -; VBITS_EQ_256-DAG: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1d { [[RES]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <8 x double> undef, double %a, i64 0 %splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer store <8 x double> %splat, <8 x double>* %b @@ -537,11 +818,27 @@ } define void @splat_v16f64(double %a, <16 x double>* %b) #0 { -; CHECK-LABEL: splat_v16f64: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].d, d0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov z0.d, d0 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v16f64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: mov z0.d, d0 +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <16 x double> undef, double %a, i64 0 %splat = shufflevector <16 x double> %insert, <16 x double> undef, <16 x i32> zeroinitializer store <16 x double> %splat, <16 x double>* %b @@ -549,27 +846,59 @@ } define void @splat_v32f64(double %a, <32 x double>* %b) #0 { -; CHECK-LABEL: splat_v32f64: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].d, d0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #28 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x10, #20 +; VBITS_GE_256-NEXT: mov z0.d, d0 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #12 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v32f64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: mov z0.d, d0 +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <32 x double> undef, double %a, i64 0 %splat = shufflevector <32 x double> %insert, <32 x double> undef, <32 x i32> zeroinitializer store <32 x double> %splat, <32 x double>* %b ret void } -; ; DUP (integer immediate) -; define void @splat_imm_v64i8(<64 x i8>* %a) #0 { -; CHECK-LABEL: splat_imm_v64i8: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].b, #1 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].b, vl64 -; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov z0.b, #1 // =0x1 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov z0.b, #1 // =0x1 +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <64 x i8> undef, i8 1, i64 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer store <64 x i8> %splat, <64 x i8>* %a @@ -577,11 +906,21 @@ } define void @splat_imm_v32i16(<32 x i16>* %a) #0 { -; CHECK-LABEL: splat_imm_v32i16: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].h, #2 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov z0.h, #2 // =0x2 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov z0.h, #2 // =0x2 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <32 x i16> undef, i16 2, i64 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer store <32 x i16> %splat, <32 x i16>* %a @@ -589,11 +928,21 @@ } define void @splat_imm_v16i32(<16 x i32>* %a) #0 { -; CHECK-LABEL: splat_imm_v16i32: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].s, #3 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov z0.s, #3 // =0x3 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov z0.s, #3 // =0x3 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <16 x i32> undef, i32 3, i64 0 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer store <16 x i32> %splat, <16 x i32>* %a @@ -601,27 +950,45 @@ } define void @splat_imm_v8i64(<8 x i64>* %a) #0 { -; CHECK-LABEL: splat_imm_v8i64: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].d, #4 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov z0.d, #4 // =0x4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov z0.d, #4 // =0x4 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <8 x i64> undef, i64 4, i64 0 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer store <8 x i64> %splat, <8 x i64>* %a ret void } -; ; DUP (floating-point immediate) -; define void @splat_imm_v32f16(<32 x half>* %a) #0 { -; CHECK-LABEL: splat_imm_v32f16: -; VBITS_GE_512-DAG: fmov [[RES:z[0-9]+]].h, #5.00000000 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: fmov z0.h, #5.00000000 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v32f16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: fmov z0.h, #5.00000000 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <32 x half> undef, half 5.0, i64 0 %splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer store <32 x half> %splat, <32 x half>* %a @@ -629,11 +996,21 @@ } define void @splat_imm_v16f32(<16 x float>* %a) #0 { -; CHECK-LABEL: splat_imm_v16f32: -; VBITS_GE_512-DAG: fmov [[RES:z[0-9]+]].s, #6.00000000 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: fmov z0.s, #6.00000000 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v16f32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: fmov z0.s, #6.00000000 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <16 x float> undef, float 6.0, i64 0 %splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer store <16 x float> %splat, <16 x float>* %a @@ -641,11 +1018,21 @@ } define void @splat_imm_v8f64(<8 x double>* %a) #0 { -; CHECK-LABEL: splat_imm_v8f64: -; VBITS_GE_512-DAG: fmov [[RES:z[0-9]+]].d, #7.00000000 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: fmov z0.d, #7.00000000 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v8f64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: fmov z0.d, #7.00000000 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <8 x double> undef, double 7.0, i64 0 %splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer store <8 x double> %splat, <8 x double>* %a diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK -; RUN: llc -aarch64-sve-vector-bits-min=384 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 -aarch64-enable-atomic-cfg-tidy=false < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -26,14 +28,12 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - define void @subvector_v8i16(<8 x i16> *%in, <8 x i16>* %out) #0 { ; CHECK-LABEL: subvector_v8i16: -; CHECK: ldr [[DATA:q[0-9]+]], [x0] -; CHECK: str [[DATA]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret %a = load <8 x i16>, <8 x i16>* %in br label %bb1 @@ -44,10 +44,11 @@ define void @subvector_v16i16(<16 x i16> *%in, <16 x i16>* %out) #0 { ; CHECK-LABEL: subvector_v16i16: -; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; CHECK: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret %a = load <16 x i16>, <16 x i16>* %in br label %bb1 @@ -57,11 +58,22 @@ } define void @subvector_v32i16(<32 x i16> *%in, <32 x i16>* %out) #0 { -; CHECK-LABEL: subvector_v32i16: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <32 x i16>, <32 x i16>* %in br label %bb1 @@ -71,11 +83,28 @@ } define void @subvector_v64i16(<64 x i16> *%in, <64 x i16>* %out) #0 { -; CHECK-LABEL: subvector_v64i16: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #32 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v64i16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <64 x i16>, <64 x i16>* %in br label %bb1 @@ -86,10 +115,11 @@ define void @subvector_v8i32(<8 x i32> *%in, <8 x i32>* %out) #0 { ; CHECK-LABEL: subvector_v8i32: -; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; CHECK: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret %a = load <8 x i32>, <8 x i32>* %in br label %bb1 @@ -99,11 +129,22 @@ } define void @subvector_v16i32(<16 x i32> *%in, <16 x i32>* %out) #0 { -; CHECK-LABEL: subvector_v16i32: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, <16 x i32>* %in br label %bb1 @@ -113,11 +154,28 @@ } define void @subvector_v32i32(<32 x i32> *%in, <32 x i32>* %out) #0 { -; CHECK-LABEL: subvector_v32i32: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v32i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <32 x i32>, <32 x i32>* %in br label %bb1 @@ -127,11 +185,40 @@ } define void @subvector_v64i32(<64 x i32> *%in, <64 x i32>* %out) #0 { -; CHECK-LABEL: subvector_v64i32: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_2048: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #56 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: mov x13, #16 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: subvector_v64i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %a = load <64 x i32>, <64 x i32>* %in br label %bb1 @@ -142,11 +229,22 @@ define void @subvector_v8i64(<8 x i64> *%in, <8 x i64>* %out) #0 { -; CHECK-LABEL: subvector_v8i64: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %in br label %bb1 @@ -156,11 +254,28 @@ } define void @subvector_v16i64(<16 x i64> *%in, <16 x i64>* %out) #0 { -; CHECK-LABEL: subvector_v16i64: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v16i64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <16 x i64>, <16 x i64>* %in br label %bb1 @@ -170,11 +285,40 @@ } define void @subvector_v32i64(<32 x i64> *%in, <32 x i64>* %out) #0 { -; CHECK-LABEL: subvector_v32i64: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_2048: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #28 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #20 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #12 +; VBITS_GE_256-NEXT: mov x13, #8 +; VBITS_GE_256-NEXT: mov x14, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: subvector_v32i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i64>, <32 x i64>* %in br label %bb1 @@ -185,9 +329,10 @@ define void @subvector_v8f16(<8 x half> *%in, <8 x half>* %out) #0 { ; CHECK-LABEL: subvector_v8f16: -; CHECK: ldr [[DATA:q[0-9]+]], [x0] -; CHECK: str [[DATA]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret %a = load <8 x half>, <8 x half>* %in br label %bb1 @@ -198,10 +343,11 @@ define void @subvector_v16f16(<16 x half> *%in, <16 x half>* %out) #0 { ; CHECK-LABEL: subvector_v16f16: -; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; CHECK: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret %a = load <16 x half>, <16 x half>* %in br label %bb1 @@ -211,11 +357,22 @@ } define void @subvector_v32f16(<32 x half> *%in, <32 x half>* %out) #0 { -; CHECK-LABEL: subvector_v32f16: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v32f16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <32 x half>, <32 x half>* %in br label %bb1 @@ -225,11 +382,28 @@ } define void @subvector_v64f16(<64 x half> *%in, <64 x half>* %out) #0 { -; CHECK-LABEL: subvector_v64f16: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #32 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v64f16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <64 x half>, <64 x half>* %in br label %bb1 @@ -240,10 +414,11 @@ define void @subvector_v8f32(<8 x float> *%in, <8 x float>* %out) #0 { ; CHECK-LABEL: subvector_v8f32: -; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; CHECK: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret %a = load <8 x float>, <8 x float>* %in br label %bb1 @@ -253,11 +428,22 @@ } define void @subvector_v16f32(<16 x float> *%in, <16 x float>* %out) #0 { -; CHECK-LABEL: subvector_v16f32: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v16f32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <16 x float>, <16 x float>* %in br label %bb1 @@ -267,11 +453,28 @@ } define void @subvector_v32f32(<32 x float> *%in, <32 x float>* %out) #0 { -; CHECK-LABEL: subvector_v32f32: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v32f32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <32 x float>, <32 x float>* %in br label %bb1 @@ -281,11 +484,40 @@ } define void @subvector_v64f32(<64 x float> *%in, <64 x float>* %out) #0 { -; CHECK-LABEL: subvector_v64f32: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_2048: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #56 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: mov x13, #16 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: subvector_v64f32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %a = load <64 x float>, <64 x float>* %in br label %bb1 @@ -294,11 +526,22 @@ ret void } define void @subvector_v8f64(<8 x double> *%in, <8 x double>* %out) #0 { -; CHECK-LABEL: subvector_v8f64: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v8f64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <8 x double>, <8 x double>* %in br label %bb1 @@ -308,11 +551,28 @@ } define void @subvector_v16f64(<16 x double> *%in, <16 x double>* %out) #0 { -; CHECK-LABEL: subvector_v16f64: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v16f64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <16 x double>, <16 x double>* %in br label %bb1 @@ -322,11 +582,40 @@ } define void @subvector_v32f64(<32 x double> *%in, <32 x double>* %out) #0 { -; CHECK-LABEL: subvector_v32f64: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_2048: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #28 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #20 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #12 +; VBITS_GE_256-NEXT: mov x13, #8 +; VBITS_GE_256-NEXT: mov x14, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: subvector_v32f64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %a = load <32 x double>, <32 x double>* %in br label %bb1 @@ -337,10 +626,15 @@ define <8 x i1> @no_warn_dropped_scalable(<8 x i32>* %in) #0 { ; CHECK-LABEL: no_warn_dropped_scalable: -; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK: ld1w { [[A:z[0-9]+]].s }, [[PG]]/z, [x0] -; CHECK: cmpgt p{{[0-9]}}.s, [[PG]]/z, [[A]].s, #0 -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: cmpgt p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret %a = load <8 x i32>, <8 x i32>* %in br label %bb1 @@ -356,14 +650,14 @@ define void @no_subvector_binop_hang(<8 x i32>* %in, <8 x i32>* %out, i1 %cond) #0 { ; CHECK-LABEL: no_subvector_binop_hang: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-NEXT: ld1w { [[A:z[0-9]+]].s }, [[PG]]/z, [x0] -; CHECK-NEXT: ld1w { [[B:z[0-9]+]].s }, [[PG]]/z, [x1] -; CHECK-NEXT: tbz w2, #0, [[LABEL:\.[A-z0-9_]+]] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: tbz w2, #0, .LBB23_2 ; CHECK-NEXT: // %bb.1: // %bb.1 -; CHECK-NEXT: orr [[OR:z[0-9]+]].d, [[A]].d, [[B]].d -; CHECK-NEXT: st1w { [[OR]].s }, [[PG]], [x1] -; CHECK-NEXT: [[LABEL]]: // %bb.2 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: .LBB23_2: // %bb.2 ; CHECK-NEXT: ret %a = load <8 x i32>, <8 x i32>* %in %b = load <8 x i32>, <8 x i32>* %out diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ; Don't use SVE for 64-bit vectors define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; CHECK-LABEL: shuffle_ext_byone_v8i8: @@ -64,22 +63,22 @@ define void @shuffle_ext_byone_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: shuffle_ext_byone_v64i8: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov w8, #32 -; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 -; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] -; VBITS_EQ_256-NEXT: ld1b { z2.b }, p0/z, [x1] -; VBITS_EQ_256-NEXT: mov z0.b, z0.b[31] -; VBITS_EQ_256-NEXT: mov z3.b, z2.b[31] -; VBITS_EQ_256-NEXT: fmov w9, s0 -; VBITS_EQ_256-NEXT: fmov w10, s3 -; VBITS_EQ_256-NEXT: insr z2.b, w9 -; VBITS_EQ_256-NEXT: insr z1.b, w10 -; VBITS_EQ_256-NEXT: st1b { z2.b }, p0, [x0] -; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x0, x8] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.b, z0.b[31] +; VBITS_GE_256-NEXT: mov z3.b, z2.b[31] +; VBITS_GE_256-NEXT: fmov w9, s0 +; VBITS_GE_256-NEXT: fmov w10, s3 +; VBITS_GE_256-NEXT: insr z2.b, w9 +; VBITS_GE_256-NEXT: insr z1.b, w10 +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v64i8: ; VBITS_GE_512: // %bb.0: @@ -106,6 +105,35 @@ } define void @shuffle_ext_byone_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v128i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #64 +; VBITS_GE_256-NEXT: mov w10, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: mov w9, #96 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z5.b, z0.b[31] +; VBITS_GE_256-NEXT: fmov w11, s5 +; VBITS_GE_256-NEXT: mov z5.b, z2.b[31] +; VBITS_GE_256-NEXT: mov z1.b, z1.b[31] +; VBITS_GE_256-NEXT: fmov w12, s5 +; VBITS_GE_256-NEXT: mov z5.b, z4.b[31] +; VBITS_GE_256-NEXT: fmov w13, s1 +; VBITS_GE_256-NEXT: fmov w14, s5 +; VBITS_GE_256-NEXT: insr z3.b, w11 +; VBITS_GE_256-NEXT: insr z0.b, w12 +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: insr z4.b, w13 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: insr z2.b, w14 +; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -140,6 +168,59 @@ } define void @shuffle_ext_byone_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v256i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: mov w11, #128 +; VBITS_GE_256-NEXT: mov w13, #64 +; VBITS_GE_256-NEXT: mov w12, #96 +; VBITS_GE_256-NEXT: mov w14, #160 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: mov w10, #192 +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1, x11] +; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x1, x13] +; VBITS_GE_256-NEXT: mov w9, #224 +; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x1, x12] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: mov z6.b, z0.b[31] +; VBITS_GE_256-NEXT: fmov w15, s6 +; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x1, x14] +; VBITS_GE_256-NEXT: mov z16.b, z3.b[31] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1b { z17.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: fmov w16, s16 +; VBITS_GE_256-NEXT: mov z16.b, z5.b[31] +; VBITS_GE_256-NEXT: insr z5.b, w15 +; VBITS_GE_256-NEXT: fmov w15, s16 +; VBITS_GE_256-NEXT: mov z16.b, z7.b[31] +; VBITS_GE_256-NEXT: mov z1.b, z1.b[31] +; VBITS_GE_256-NEXT: fmov w17, s16 +; VBITS_GE_256-NEXT: mov z16.b, z6.b[31] +; VBITS_GE_256-NEXT: fmov w18, s16 +; VBITS_GE_256-NEXT: mov z16.b, z4.b[31] +; VBITS_GE_256-NEXT: insr z7.b, w15 +; VBITS_GE_256-NEXT: fmov w15, s16 +; VBITS_GE_256-NEXT: mov z16.b, z17.b[31] +; VBITS_GE_256-NEXT: fmov w1, s1 +; VBITS_GE_256-NEXT: fmov w2, s16 +; VBITS_GE_256-NEXT: insr z3.b, w17 +; VBITS_GE_256-NEXT: insr z6.b, w16 +; VBITS_GE_256-NEXT: insr z4.b, w18 +; VBITS_GE_256-NEXT: insr z2.b, w15 +; VBITS_GE_256-NEXT: insr z17.b, w1 +; VBITS_GE_256-NEXT: insr z0.b, w2 +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x11] +; VBITS_GE_256-NEXT: st1b { z7.b }, p0, [x0, x12] +; VBITS_GE_256-NEXT: st1b { z5.b }, p0, [x0, x13] +; VBITS_GE_256-NEXT: st1b { z6.b }, p0, [x0, x14] +; VBITS_GE_256-NEXT: st1b { z17.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 @@ -230,22 +311,22 @@ define void @shuffle_ext_byone_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: shuffle_ext_byone_v32i16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1] -; VBITS_EQ_256-NEXT: mov z0.h, z0.h[15] -; VBITS_EQ_256-NEXT: mov z3.h, z2.h[15] -; VBITS_EQ_256-NEXT: fmov w9, s0 -; VBITS_EQ_256-NEXT: fmov w10, s3 -; VBITS_EQ_256-NEXT: insr z2.h, w9 -; VBITS_EQ_256-NEXT: insr z1.h, w10 -; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.h, z0.h[15] +; VBITS_GE_256-NEXT: mov z3.h, z2.h[15] +; VBITS_GE_256-NEXT: fmov w9, s0 +; VBITS_GE_256-NEXT: fmov w10, s3 +; VBITS_GE_256-NEXT: insr z2.h, w9 +; VBITS_GE_256-NEXT: insr z1.h, w10 +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v32i16: ; VBITS_GE_512: // %bb.0: @@ -268,6 +349,35 @@ } define void @shuffle_ext_byone_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z5.h, z0.h[15] +; VBITS_GE_256-NEXT: fmov w11, s5 +; VBITS_GE_256-NEXT: mov z5.h, z2.h[15] +; VBITS_GE_256-NEXT: mov z1.h, z1.h[15] +; VBITS_GE_256-NEXT: fmov w12, s5 +; VBITS_GE_256-NEXT: mov z5.h, z4.h[15] +; VBITS_GE_256-NEXT: fmov w13, s1 +; VBITS_GE_256-NEXT: fmov w14, s5 +; VBITS_GE_256-NEXT: insr z3.h, w11 +; VBITS_GE_256-NEXT: insr z0.h, w12 +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: insr z4.h, w13 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: insr z2.h, w14 +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -294,6 +404,59 @@ } define void @shuffle_ext_byone_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v128i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #64 +; VBITS_GE_256-NEXT: mov x13, #32 +; VBITS_GE_256-NEXT: mov x14, #48 +; VBITS_GE_256-NEXT: mov x11, #80 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: mov x12, #96 +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_256-NEXT: mov x9, #112 +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: mov z6.h, z0.h[15] +; VBITS_GE_256-NEXT: fmov w15, s6 +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: mov z16.h, z2.h[15] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: fmov w16, s16 +; VBITS_GE_256-NEXT: mov z16.h, z5.h[15] +; VBITS_GE_256-NEXT: insr z5.h, w15 +; VBITS_GE_256-NEXT: fmov w15, s16 +; VBITS_GE_256-NEXT: mov z16.h, z7.h[15] +; VBITS_GE_256-NEXT: mov z1.h, z1.h[15] +; VBITS_GE_256-NEXT: fmov w17, s16 +; VBITS_GE_256-NEXT: mov z16.h, z6.h[15] +; VBITS_GE_256-NEXT: fmov w18, s16 +; VBITS_GE_256-NEXT: mov z16.h, z4.h[15] +; VBITS_GE_256-NEXT: insr z7.h, w15 +; VBITS_GE_256-NEXT: fmov w15, s16 +; VBITS_GE_256-NEXT: mov z16.h, z17.h[15] +; VBITS_GE_256-NEXT: fmov w1, s1 +; VBITS_GE_256-NEXT: fmov w2, s16 +; VBITS_GE_256-NEXT: insr z2.h, w17 +; VBITS_GE_256-NEXT: insr z6.h, w16 +; VBITS_GE_256-NEXT: insr z4.h, w18 +; VBITS_GE_256-NEXT: insr z3.h, w15 +; VBITS_GE_256-NEXT: insr z17.h, w1 +; VBITS_GE_256-NEXT: insr z0.h, w2 +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z17.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -367,22 +530,22 @@ define void @shuffle_ext_byone_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: shuffle_ext_byone_v16i32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1] -; VBITS_EQ_256-NEXT: mov z0.s, z0.s[7] -; VBITS_EQ_256-NEXT: mov z3.s, z2.s[7] -; VBITS_EQ_256-NEXT: fmov w9, s0 -; VBITS_EQ_256-NEXT: fmov w10, s3 -; VBITS_EQ_256-NEXT: insr z2.s, w9 -; VBITS_EQ_256-NEXT: insr z1.s, w10 -; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: shuffle_ext_byone_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.s, z0.s[7] +; VBITS_GE_256-NEXT: mov z3.s, z2.s[7] +; VBITS_GE_256-NEXT: fmov w9, s0 +; VBITS_GE_256-NEXT: fmov w10, s3 +; VBITS_GE_256-NEXT: insr z2.s, w9 +; VBITS_GE_256-NEXT: insr z1.s, w10 +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v16i32: ; VBITS_GE_512: // %bb.0: @@ -403,6 +566,35 @@ } define void @shuffle_ext_byone_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z5.s, z0.s[7] +; VBITS_GE_256-NEXT: fmov w11, s5 +; VBITS_GE_256-NEXT: mov z5.s, z2.s[7] +; VBITS_GE_256-NEXT: mov z1.s, z1.s[7] +; VBITS_GE_256-NEXT: fmov w12, s5 +; VBITS_GE_256-NEXT: mov z5.s, z4.s[7] +; VBITS_GE_256-NEXT: fmov w13, s1 +; VBITS_GE_256-NEXT: fmov w14, s5 +; VBITS_GE_256-NEXT: insr z3.s, w11 +; VBITS_GE_256-NEXT: insr z0.s, w12 +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: insr z4.s, w13 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: insr z2.s, w14 +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -425,6 +617,59 @@ } define void @shuffle_ext_byone_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: mov x13, #16 +; VBITS_GE_256-NEXT: mov x14, #24 +; VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x12, #48 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: mov x9, #56 +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: mov z6.s, z0.s[7] +; VBITS_GE_256-NEXT: fmov w15, s6 +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: mov z16.s, z2.s[7] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fmov w16, s16 +; VBITS_GE_256-NEXT: mov z16.s, z5.s[7] +; VBITS_GE_256-NEXT: insr z5.s, w15 +; VBITS_GE_256-NEXT: fmov w15, s16 +; VBITS_GE_256-NEXT: mov z16.s, z7.s[7] +; VBITS_GE_256-NEXT: mov z1.s, z1.s[7] +; VBITS_GE_256-NEXT: fmov w17, s16 +; VBITS_GE_256-NEXT: mov z16.s, z6.s[7] +; VBITS_GE_256-NEXT: fmov w18, s16 +; VBITS_GE_256-NEXT: mov z16.s, z4.s[7] +; VBITS_GE_256-NEXT: insr z7.s, w15 +; VBITS_GE_256-NEXT: fmov w15, s16 +; VBITS_GE_256-NEXT: mov z16.s, z17.s[7] +; VBITS_GE_256-NEXT: fmov w1, s1 +; VBITS_GE_256-NEXT: fmov w2, s16 +; VBITS_GE_256-NEXT: insr z2.s, w17 +; VBITS_GE_256-NEXT: insr z6.s, w16 +; VBITS_GE_256-NEXT: insr z4.s, w18 +; VBITS_GE_256-NEXT: insr z3.s, w15 +; VBITS_GE_256-NEXT: insr z17.s, w1 +; VBITS_GE_256-NEXT: insr z0.s, w2 +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z17.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -480,22 +725,22 @@ define void @shuffle_ext_byone_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: shuffle_ext_byone_v8i64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_EQ_256-NEXT: mov z0.d, z0.d[3] -; VBITS_EQ_256-NEXT: mov z3.d, z2.d[3] -; VBITS_EQ_256-NEXT: fmov x9, d0 -; VBITS_EQ_256-NEXT: fmov x10, d3 -; VBITS_EQ_256-NEXT: insr z2.d, x9 -; VBITS_EQ_256-NEXT: insr z1.d, x10 -; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: shuffle_ext_byone_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.d, z0.d[3] +; VBITS_GE_256-NEXT: mov z3.d, z2.d[3] +; VBITS_GE_256-NEXT: fmov x9, d0 +; VBITS_GE_256-NEXT: fmov x10, d3 +; VBITS_GE_256-NEXT: insr z2.d, x9 +; VBITS_GE_256-NEXT: insr z1.d, x10 +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v8i64: ; VBITS_GE_512: // %bb.0: @@ -515,6 +760,35 @@ } define void @shuffle_ext_byone_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z5.d, z0.d[3] +; VBITS_GE_256-NEXT: fmov x11, d5 +; VBITS_GE_256-NEXT: mov z5.d, z2.d[3] +; VBITS_GE_256-NEXT: mov z1.d, z1.d[3] +; VBITS_GE_256-NEXT: fmov x12, d5 +; VBITS_GE_256-NEXT: mov z5.d, z4.d[3] +; VBITS_GE_256-NEXT: fmov x13, d1 +; VBITS_GE_256-NEXT: fmov x14, d5 +; VBITS_GE_256-NEXT: insr z3.d, x11 +; VBITS_GE_256-NEXT: insr z0.d, x12 +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: insr z4.d, x13 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: insr z2.d, x14 +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -535,6 +809,59 @@ } define void @shuffle_ext_byone_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x13, #8 +; VBITS_GE_256-NEXT: mov x14, #12 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: mov z6.d, z0.d[3] +; VBITS_GE_256-NEXT: fmov x15, d6 +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: mov z16.d, z2.d[3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fmov x16, d16 +; VBITS_GE_256-NEXT: mov z16.d, z5.d[3] +; VBITS_GE_256-NEXT: insr z5.d, x15 +; VBITS_GE_256-NEXT: fmov x15, d16 +; VBITS_GE_256-NEXT: mov z16.d, z7.d[3] +; VBITS_GE_256-NEXT: mov z1.d, z1.d[3] +; VBITS_GE_256-NEXT: fmov x17, d16 +; VBITS_GE_256-NEXT: mov z16.d, z6.d[3] +; VBITS_GE_256-NEXT: fmov x18, d16 +; VBITS_GE_256-NEXT: mov z16.d, z4.d[3] +; VBITS_GE_256-NEXT: insr z7.d, x15 +; VBITS_GE_256-NEXT: fmov x15, d16 +; VBITS_GE_256-NEXT: mov z16.d, z17.d[3] +; VBITS_GE_256-NEXT: fmov x1, d1 +; VBITS_GE_256-NEXT: fmov x2, d16 +; VBITS_GE_256-NEXT: insr z2.d, x17 +; VBITS_GE_256-NEXT: insr z6.d, x16 +; VBITS_GE_256-NEXT: insr z4.d, x18 +; VBITS_GE_256-NEXT: insr z3.d, x15 +; VBITS_GE_256-NEXT: insr z17.d, x1 +; VBITS_GE_256-NEXT: insr z0.d, x2 +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z17.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -596,20 +923,20 @@ define void @shuffle_ext_byone_v32f16(<32 x half>* %a, <32 x half>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: shuffle_ext_byone_v32f16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1] -; VBITS_EQ_256-NEXT: mov z0.h, z0.h[15] -; VBITS_EQ_256-NEXT: mov z3.h, z2.h[15] -; VBITS_EQ_256-NEXT: insr z2.h, h0 -; VBITS_EQ_256-NEXT: insr z1.h, h3 -; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.h, z0.h[15] +; VBITS_GE_256-NEXT: mov z3.h, z2.h[15] +; VBITS_GE_256-NEXT: insr z2.h, h0 +; VBITS_GE_256-NEXT: insr z1.h, h3 +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v32f16: ; VBITS_GE_512: // %bb.0: @@ -631,6 +958,31 @@ } define void @shuffle_ext_byone_v64f16(<64 x half>* %a, <64 x half>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z5.h, z0.h[15] +; VBITS_GE_256-NEXT: insr z1.h, h5 +; VBITS_GE_256-NEXT: mov z5.h, z3.h[15] +; VBITS_GE_256-NEXT: mov z2.h, z2.h[15] +; VBITS_GE_256-NEXT: insr z0.h, h5 +; VBITS_GE_256-NEXT: mov z5.h, z4.h[15] +; VBITS_GE_256-NEXT: insr z4.h, h2 +; VBITS_GE_256-NEXT: insr z3.h, h5 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -657,6 +1009,51 @@ } define void @shuffle_ext_byone_v128f16(<128 x half>* %a, <128 x half>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v128f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x10, #64 +; VBITS_GE_256-NEXT: mov x9, #80 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x12, #32 +; VBITS_GE_256-NEXT: mov x13, #48 +; VBITS_GE_256-NEXT: mov x8, #112 +; VBITS_GE_256-NEXT: mov x14, #96 +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z18.h, z3.h[15] +; VBITS_GE_256-NEXT: mov z6.h, z1.h[15] +; VBITS_GE_256-NEXT: insr z1.h, h18 +; VBITS_GE_256-NEXT: mov z18.h, z5.h[15] +; VBITS_GE_256-NEXT: mov z19.h, z4.h[15] +; VBITS_GE_256-NEXT: insr z4.h, h18 +; VBITS_GE_256-NEXT: mov z18.h, z16.h[15] +; VBITS_GE_256-NEXT: insr z3.h, h18 +; VBITS_GE_256-NEXT: mov z18.h, z7.h[15] +; VBITS_GE_256-NEXT: insr z7.h, h6 +; VBITS_GE_256-NEXT: mov z0.h, z0.h[15] +; VBITS_GE_256-NEXT: mov z6.h, z17.h[15] +; VBITS_GE_256-NEXT: insr z16.h, h19 +; VBITS_GE_256-NEXT: insr z2.h, h18 +; VBITS_GE_256-NEXT: insr z17.h, h0 +; VBITS_GE_256-NEXT: insr z5.h, h6 +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z16.h }, p0, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z17.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -729,20 +1126,20 @@ define void @shuffle_ext_byone_v16f32(<16 x float>* %a, <16 x float>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: shuffle_ext_byone_v16f32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1] -; VBITS_EQ_256-NEXT: mov z0.s, z0.s[7] -; VBITS_EQ_256-NEXT: mov z3.s, z2.s[7] -; VBITS_EQ_256-NEXT: insr z2.s, s0 -; VBITS_EQ_256-NEXT: insr z1.s, s3 -; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: shuffle_ext_byone_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.s, z0.s[7] +; VBITS_GE_256-NEXT: mov z3.s, z2.s[7] +; VBITS_GE_256-NEXT: insr z2.s, s0 +; VBITS_GE_256-NEXT: insr z1.s, s3 +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v16f32: ; VBITS_GE_512: // %bb.0: @@ -762,6 +1159,31 @@ } define void @shuffle_ext_byone_v32f32(<32 x float>* %a, <32 x float>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z5.s, z0.s[7] +; VBITS_GE_256-NEXT: insr z1.s, s5 +; VBITS_GE_256-NEXT: mov z5.s, z3.s[7] +; VBITS_GE_256-NEXT: mov z2.s, z2.s[7] +; VBITS_GE_256-NEXT: insr z0.s, s5 +; VBITS_GE_256-NEXT: mov z5.s, z4.s[7] +; VBITS_GE_256-NEXT: insr z4.s, s2 +; VBITS_GE_256-NEXT: insr z3.s, s5 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -784,6 +1206,51 @@ } define void @shuffle_ext_byone_v64f32(<64 x float>* %a, <64 x float>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: mov x9, #40 +; VBITS_GE_256-NEXT: mov x11, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x13, #24 +; VBITS_GE_256-NEXT: mov x8, #56 +; VBITS_GE_256-NEXT: mov x14, #48 +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z18.s, z3.s[7] +; VBITS_GE_256-NEXT: mov z6.s, z1.s[7] +; VBITS_GE_256-NEXT: insr z1.s, s18 +; VBITS_GE_256-NEXT: mov z18.s, z5.s[7] +; VBITS_GE_256-NEXT: mov z19.s, z4.s[7] +; VBITS_GE_256-NEXT: insr z4.s, s18 +; VBITS_GE_256-NEXT: mov z18.s, z16.s[7] +; VBITS_GE_256-NEXT: insr z3.s, s18 +; VBITS_GE_256-NEXT: mov z18.s, z7.s[7] +; VBITS_GE_256-NEXT: insr z7.s, s6 +; VBITS_GE_256-NEXT: mov z0.s, z0.s[7] +; VBITS_GE_256-NEXT: mov z6.s, z17.s[7] +; VBITS_GE_256-NEXT: insr z16.s, s19 +; VBITS_GE_256-NEXT: insr z2.s, s18 +; VBITS_GE_256-NEXT: insr z17.s, s0 +; VBITS_GE_256-NEXT: insr z5.s, s6 +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z16.s }, p0, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z17.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -838,20 +1305,20 @@ define void @shuffle_ext_byone_v8f64(<8 x double>* %a, <8 x double>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: shuffle_ext_byone_v8f64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_EQ_256-NEXT: mov z0.d, z0.d[3] -; VBITS_EQ_256-NEXT: mov z3.d, z2.d[3] -; VBITS_EQ_256-NEXT: insr z2.d, d0 -; VBITS_EQ_256-NEXT: insr z1.d, d3 -; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: shuffle_ext_byone_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.d, z0.d[3] +; VBITS_GE_256-NEXT: mov z3.d, z2.d[3] +; VBITS_GE_256-NEXT: insr z2.d, d0 +; VBITS_GE_256-NEXT: insr z1.d, d3 +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v8f64: ; VBITS_GE_512: // %bb.0: @@ -870,6 +1337,31 @@ } define void @shuffle_ext_byone_v16f64(<16 x double>* %a, <16 x double>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z5.d, z0.d[3] +; VBITS_GE_256-NEXT: insr z1.d, d5 +; VBITS_GE_256-NEXT: mov z5.d, z3.d[3] +; VBITS_GE_256-NEXT: mov z2.d, z2.d[3] +; VBITS_GE_256-NEXT: insr z0.d, d5 +; VBITS_GE_256-NEXT: mov z5.d, z4.d[3] +; VBITS_GE_256-NEXT: insr z4.d, d2 +; VBITS_GE_256-NEXT: insr z3.d, d5 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -890,6 +1382,51 @@ } define void @shuffle_ext_byone_v32f64(<32 x double>* %a, <32 x double>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #20 +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x12, #8 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov x8, #28 +; VBITS_GE_256-NEXT: mov x14, #24 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z18.d, z3.d[3] +; VBITS_GE_256-NEXT: mov z6.d, z1.d[3] +; VBITS_GE_256-NEXT: insr z1.d, d18 +; VBITS_GE_256-NEXT: mov z18.d, z5.d[3] +; VBITS_GE_256-NEXT: mov z19.d, z4.d[3] +; VBITS_GE_256-NEXT: insr z4.d, d18 +; VBITS_GE_256-NEXT: mov z18.d, z16.d[3] +; VBITS_GE_256-NEXT: insr z3.d, d18 +; VBITS_GE_256-NEXT: mov z18.d, z7.d[3] +; VBITS_GE_256-NEXT: insr z7.d, d6 +; VBITS_GE_256-NEXT: mov z0.d, z0.d[3] +; VBITS_GE_256-NEXT: mov z6.d, z17.d[3] +; VBITS_GE_256-NEXT: insr z16.d, d19 +; VBITS_GE_256-NEXT: insr z2.d, d18 +; VBITS_GE_256-NEXT: insr z17.d, d0 +; VBITS_GE_256-NEXT: insr z5.d, d6 +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z16.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z17.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32